# Compare the mean conversation length (in characters of the stringified
# ``conversations`` field) between the OpenThoughts3 and NeMo datasets.
import os

import numpy as np
from datasets import load_dataset


def _add_conv_char_length(dataset):
    """Return *dataset* with a ``conv_char_length`` column added.

    For each example the new column holds ``len(str(example["conversations"]))``,
    i.e. the character length of the Python string representation of the
    ``conversations`` field (not a token count, and not the sum of message
    lengths).
    """
    return dataset.map(
        lambda x: {"conv_char_length": len(str(x["conversations"]))},
        # os.cpu_count() may return None; it is passed through to ``map`` as-is.
        num_proc=os.cpu_count(),
    )


# ``split="train"`` requests a single split so the column access below
# (``ds["conv_char_length"]``) operates on one dataset. The original passed
# the misspelled keyword ``spt``, which is not a ``load_dataset`` parameter.
# NOTE(review): both repo ids start with "-dev/", which looks like a
# truncated organisation name -- confirm the intended namespace.
ds_ot = load_dataset("-dev/openthoughts3_300k", split="train")
ds_nemo = load_dataset("-dev/nemo_nano_300k", split="train")

ds_ot = _add_conv_char_length(ds_ot)
ds_nemo = _add_conv_char_length(ds_nemo)

print("OpenThoughts3 mean conversation length:", np.mean(ds_ot["conv_char_length"]))
print("NeMo mean conversation length:", np.mean(ds_nemo["conv_char_length"]))