"""Build a combined reasoning dataset and push it to the Hugging Face Hub.

Two source datasets are loaded, each is mapped through
``reasoning_commons.map_to_sharegpt``, the results are concatenated, passed
through ``reasoning_commons.decontaminate``, reduced to the ``conversations``
column, and pushed to the Hub under a name derived from this file's name.
"""

import os
from functools import partial

from datasets import concatenate_datasets, load_dataset

from reasoning_commons import decontaminate, map_to_sharegpt

datasets = []

# GeneralThought-Feb25
# `split="train"` makes load_dataset return a single Dataset (not a
# DatasetDict), which is what concatenate_datasets below requires.
ds = load_dataset("GeneralReasoning/GeneralThought-Feb25", split="train")
ds = ds.map(
    partial(
        map_to_sharegpt,
        question_col="question",
        reasoning_col="model_reasoning",
        solution_col="model_answer",
    ),
    num_proc=os.cpu_count(),
)
datasets.append(ds)

# Open-Thoughts-114k ("metadata" configuration)
ds = load_dataset("open-thoughts/Open-Thoughts-114k", "metadata", split="train")
# Align the question column name with the other source before mapping.
ds = ds.rename_column("problem", "question")
ds = ds.map(
    partial(
        map_to_sharegpt,
        question_col="question",
        reasoning_col="deepseek_reasoning",
        solution_col="deepseek_solution",
    ),
    num_proc=os.cpu_count(),
)
datasets.append(ds)

# Combine into final mix
ds = concatenate_datasets(datasets)
ds = decontaminate(ds)
# Keep only the ShareGPT-style column (presumably produced by
# map_to_sharegpt -- confirm against reasoning_commons).
ds = ds.select_columns(["conversations"])

# Push to hub
# The dataset name is this script's file name without its extension.
dataset_name = os.path.splitext(os.path.basename(__file__))[0]
# NOTE(review): the namespace "-dev" starts with a hyphen and looks truncated;
# confirm the intended organization name before running.
ds.push_to_hub(f"-dev/{dataset_name}")