"""Build domain-stratified subsets of Open-Thoughts-114k and push them to the Hub.

For each sample budget, every domain contributes a share of rows proportional
to its share of the full dataset. The final budget equals the full dataset
size, in which case every row of every domain is kept.
"""

import os
from functools import partial

from datasets import concatenate_datasets, load_dataset

from reasoning_commons import decontaminate, map_to_sharegpt

num_proc = os.cpu_count()

# Open-Thoughts-114k
# `split="train"` makes load_dataset return a single Dataset (not a
# DatasetDict), which the len()/unique()/filter() calls below rely on.
ds = load_dataset("open-thoughts/Open-Thoughts-114k", "metadata", split="train")
ds = ds.rename_column("problem", "question")
# NOTE(review): map_to_sharegpt is assumed to add the "conversations" column
# selected further down -- confirm against reasoning_commons.
ds = ds.map(
    partial(
        map_to_sharegpt,
        question_col="question",
        reasoning_col="deepseek_reasoning",
        solution_col="deepseek_solution",
    ),
    num_proc=num_proc,
)

total = len(ds)
domains = ds.unique("domain")
budgets = [1_000, 3_000, 10_000, 30_000, 100_000, total]

# Filter and shuffle each domain once. Neither step depends on the budget and
# the shuffle seed is fixed, so every budget sees the same row order.
shuffled_by_domain = []
for domain in domains:
    domain_ds = ds.filter(lambda x: x["domain"] == domain, num_proc=num_proc)
    shuffled_by_domain.append(domain_ds.shuffle(seed=42))

repo_ids = []
for budget in budgets:
    domains_ds = []
    for domain_ds in shuffled_by_domain:
        domain_total = len(domain_ds)
        if budget != total:
            # Proportional allocation; int() truncates, so the mix may end up
            # slightly below the nominal budget.
            ratio = float(domain_total) / total
            # Clamp so a budget larger than the dataset cannot request more
            # rows than the domain actually has.
            domain_ds = domain_ds.take(min(int(budget * ratio), domain_total))
        domains_ds.append(domain_ds)

    mix = concatenate_datasets(domains_ds)
    mix = mix.select_columns(["conversations", "domain"])
    print(f"budget: {budget}, used: {len(mix):,}")

    # NOTE(review): the repo id has an empty-looking namespace prefix ("-dev/");
    # confirm the intended Hub organisation before running.
    repo_id = f"-dev/openthoughts_{budget}"
    mix.push_to_hub(repo_id)
    repo_ids.append(repo_id)

print("All datasets created: ")
for repo_id in repo_ids:
    print(repo_id)