import pandas as pd
from datasets import Dataset, concatenate_datasets, load_dataset


def camel_bsample(
    dataset: Dataset, domain: str, num_samples_per_btopic: int
) -> Dataset:
    """Draw a fixed-size sample from every ``b_topic`` group of *dataset*.

    The dataset is converted to a pandas DataFrame, each distinct value of
    the ``b_topic`` column is sampled independently with a fixed seed, and
    the combined rows are tagged with a constant ``domain`` column.

    Args:
        dataset: Dataset that contains a ``b_topic`` column.
        domain: Value written into the new ``domain`` column of every row.
        num_samples_per_btopic: Number of rows to draw from each group.

    Returns:
        A new ``Dataset`` holding the sampled rows plus the ``domain`` column.

    Raises:
        ValueError: Raised by pandas when a group has fewer rows than
            ``num_samples_per_btopic`` (sampling is without replacement).
    """
    # Convert to pandas for faster operations
    df = dataset.to_pandas()

    # Group by btopic and sample. The seed is fixed per group so that the
    # selection is reproducible from run to run.
    sampled_dfs = []
    for btopic in df["b_topic"].unique():
        btopic_sample = df[df["b_topic"] == btopic].sample(
            n=num_samples_per_btopic, random_state=42
        )
        sampled_dfs.append(btopic_sample)

    # Combine all samples
    result_df = pd.concat(sampled_dfs)
    result_df["domain"] = domain

    # Convert back to HuggingFace dataset.
    # NOTE(review): the sampled frame keeps its original row labels, so
    # from_pandas presumably emits an extra index column; the caller below
    # drops it via select_columns - confirm before reusing elsewhere.
    return Dataset.from_pandas(result_df)


##### CREATE THE SCALED DATASET #####
# For conversions and re-uploads to -dev hub:
# python science_and_puzzle_investigate.py
# python camel_load_fast.py
def main() -> None:
    """Build the science + puzzle mix and push it to the hub."""
    # `split="train"` makes load_dataset return a single Dataset; the code
    # below relies on Dataset methods (to_pandas, shuffle, take, ...).
    camel_physics = load_dataset("-dev/camel-ai-physics", split="train")
    camel_biology = load_dataset("-dev/camel-ai-biology", split="train")
    camel_chemistry = load_dataset("-dev/camel-ai-chemistry", split="train")
    riddle_sense = load_dataset("-dev/riddle_sense_converted", split="train")

    # Puzzle portion: 1,250 shuffled riddles, without the answer key,
    # tagged with a constant domain.
    puzzle_scaled = riddle_sense.shuffle(seed=42).take(1_250)
    puzzle_scaled = puzzle_scaled.remove_columns(["answerKey"])
    puzzle_scaled = puzzle_scaled.add_column(
        "domain", ["puzzle"] * len(puzzle_scaled)
    )

    # Science portion: two rows per b_topic from each CAMEL dataset.
    biology_scaled = camel_bsample(camel_biology, "biology", 2)
    physics_scaled = camel_bsample(camel_physics, "physics", 2)
    chemistry_scaled = camel_bsample(camel_chemistry, "chemistry", 2)
    science_scaled = concatenate_datasets(
        [biology_scaled, physics_scaled, chemistry_scaled]
    )

    # Normalise column names, then keep only the columns that are published.
    science_scaled = science_scaled.rename_column("message_1", "question")
    science_scaled = science_scaled.rename_column("topic;", "topic")
    science_scaled = science_scaled.select_columns(
        ["question", "domain", "topic", "b_topic"]
    )

    science_and_puzzle_stratos_scaled = concatenate_datasets(
        [science_scaled, puzzle_scaled]
    )
    science_and_puzzle_stratos_scaled.push_to_hub(
        "-dev/science_and_puzzle_stratos_scale_pre_decontamination"
    )


if __name__ == "__main__":
    main()