"""Build a ShareGPT-style conversation dataset and push it to the Hub.

Two sources are loaded, converted to a single ``conversations`` column and
concatenated:

1. the organic-chemistry dataset, restricted to rows that carry a non-trivial
   ``r1_response`` and ``r1_reasoning_content``;
2. the ``metadata`` configuration of ``open-thoughts/Open-Thoughts-114k``.

Note that the pipeline runs at import time, exactly like the original script.
"""
import os
import random

from bespokelabs import curator
from datasets import concatenate_datasets, load_dataset
from pydantic import BaseModel

# Worker count shared by every ``Dataset.map`` call below.
num_cpus = os.cpu_count()

random.seed(0)


def map_to_sharegpt(x):
    """Attach a two-turn ShareGPT ``conversations`` list to the row ``x``.

    The user turn is ``x["problem"]``. The assistant turn wraps the reasoning
    trace in ``<think>`` tags, followed by the final answer. Each of the two
    values is read from the ``r1_*`` key when that key is present, otherwise
    from the ``deepseek_*`` key.

    Args:
        x: A dataset row supporting ``[]`` and ``.get``; must contain
            ``"problem"``.

    Returns:
        The same row object, mutated in place with a ``"conversations"`` key.
    """
    reasoning = x.get("r1_reasoning_content", x.get("deepseek_reasoning"))
    answer = x.get("r1_response", x.get("deepseek_solution"))
    x["conversations"] = [
        {"from": "user", "value": x["problem"]},
        {
            "from": "assistant",
            "value": f"<think>\n{reasoning}\n</think>\n\n{answer}",
        },
    ]
    return x


def _has_r1_output(row):
    """Return True when both R1 fields are present and longer than one char."""
    return (
        row["r1_response"] is not None
        and len(row["r1_response"]) > 1
        and row["r1_reasoning_content"] is not None
        and len(row["r1_reasoning_content"]) > 1
    )


datasets = []

# NOTE(review): this repo id starts with "/", so the namespace appears to be
# missing -- confirm the intended "<org>/<name>" before running.
# ``split="train"`` (the original had the typo ``spt``) makes ``load_dataset``
# return a single ``Dataset``, which ``concatenate_datasets`` requires.
ds = load_dataset("/pdf_unverified_organic_chem_data", split="train")
ds = ds.filter(_has_r1_output)
ds = ds.map(map_to_sharegpt, num_proc=num_cpus)
ds = ds.select_columns(["conversations"])
datasets.append(ds)

ot_ds = load_dataset("open-thoughts/Open-Thoughts-114k", "metadata", split="train")
ot_ds = ot_ds.map(map_to_sharegpt, num_proc=num_cpus)
ot_ds = ot_ds.select_columns(["conversations"])
# The original appended ``ds`` a second time here, which duplicated every
# organic-chemistry row in the combined output; each source is now added once.
datasets.append(ot_ds)

combined = concatenate_datasets(datasets)
# NOTE(review): target repo id also lacks a namespace -- confirm.
combined.push_to_hub("/convos__SCP_org_chemistry")