import os import re from datasets import concatenate_datasets, load_dataset from reasoning_commons import decontaminate, map_to_sharegpt def convert(x):  question_pattern = r".*<\|end_header_id\|>\s*(.+?)<\|eot_id\|>"  answer_pattern = r"(.+?)<\|eot_id\|>"  question_match = re.search(question_pattern, x["input"], re.DOTALL)  answer_match = re.search(answer_pattern, x["output"], re.DOTALL)  x["question"] = question_match.group(1)  x["answer"] = answer_match.group(1)  x["conversations"] = [  {"from": "user", "value": x["question"]},  {"from": "assistant", "value": x["answer"]},  ]  x["system"] = f"detailed thinking {x['reasoning']}"  return x s = ["code", "math", "science", "chat", "safety"] ds = load_dataset("nvidia/Llama-Nemotron-Post-Training-Dataset-v1", "SFT", spt=s) ds = concatenate_datasets(ds) ds = ds.filter(lambda x: x["used_in_training"] == "yes", num_proc=os.cpu_count()) ds = ds.map(convert, num_proc=os.cpu_count()) ds = ds.shuffle(seed=42) ds = ds.remove_columns(["input", "output", "cense", "used_in_training"]) # for n in [1_000, 3_000, 10_000, 30_000, 100_000, 300_000, 1_000_000, len(ds)]: # ds.take(n).push_to_hub(f"-dev/nemotron-sft_{n}") ds = ds.filter(lambda x: "R1" in x["generator"], num_proc=os.cpu_count()) ds.push_to_hub(f"-dev/nemotron-sft_R1_{len(ds)}") 