---
# Dataset preparation settings.
# The script that consumes this file is not visible from here, so every note
# about units below is an assumption to confirm against that script.

# --- Source data ---
# Dataset identifier, the split to read, and the column holding the raw text.
dataset: 'HuggingFaceFW/fineweb'
split: 'train'
text_column: 'text'

# --- Sizing ---
# NOTE(review): presumably the maximum sequence length, in tokens -- confirm.
max_length: 1024
# NOTE(review): the unit is not stated in this file; the "10B" suffix in
# save_path suggests this may be counted in billions of tokens -- confirm.
num_tokens: 10
# NOTE(review): presumably the number of tokens per output shard -- confirm.
shard_size: 100000000

# --- Output ---
# Location the results are written to.
save_path: 'finefineweb10B'