# Dataset and tokenizer settings. The code that consumes this file is not
# visible from here, so notes that go beyond the literal values are hedged.

# Training corpus identifier. Looks like a Hugging Face Hub "org/name"
# dataset id -- TODO confirm against the loader.
train: HuggingFaceFW/fineweb-edu

# Validation corpus identifier. The trailing note names wikitext103,
# apparently an alternative choice left by the original author.
valid: openwebtext-valid  # wikitext103

# Tokenizer, given either by name or by filesystem path (per the key name).
tokenizer_name_or_path: gpt2

# NOTE(review): absolute, user-specific path; it will most likely need to
# be overridden on any other machine or cluster.
cache_dir: /share/kuleshov/ssahoo/textdiffusion/data

# Boolean switches, written in canonical lowercase form.
# Presumably: `wrap` controls wrapping/packing of token sequences and
# `streaming` loads the dataset as a stream -- verify against the consumer.
wrap: true
streaming: true

# Presumably whether an EOS token is inserted into training / validation
# sequences respectively -- verify against the consumer.
insert_train_eos: true
insert_valid_eos: true