# Dataset / tokenizer settings.
# NOTE(review): the meaning of each key below is inferred from its name only;
# confirm against the code that consumes this file.

# Identifiers of the training and validation splits.
train: openwebtext-train
valid: openwebtext-valid

# Tokenizer name or path (presumably resolved by the consuming loader).
tokenizer_name_or_path: gpt2

# The value appears to be a placeholder: replace it with a real directory
# before running.
cache_dir: input_your_data_path

# Boolean switches, written in canonical lowercase so that every YAML 1.1 and
# 1.2 loader resolves them to real booleans rather than strings.
wrap: true
streaming: false
insert_train_eos: true
insert_valid_eos: true

# Presumably the fraction of each split to use (1.0 = all of it); verify.
train_ratio: 1.0
valid_ratio: 1.0