### Fixed arguments for the SlimPajama-6B data handler setup

# HuggingFace Datasets identifier of the dataset to load
hf_dataset_id: 'DKYoon/SlimPajama-6B'
# Subset name, for datasets that ship multiple subsets (empty string here)
hf_data_subset_name: ''
# HuggingFace repository ID of the model whose tokenizer is used
tokenizer_repo_id: 'openai-community/gpt2'
# When true, the loader loads every split that exists on the hub for the
# dataset and re-splits them according to `default_split_ratio`
force_splits: true
# When true, the dataset is processed again
force_overwrite: false
# Subsample size index; valid values are `0`, `1`, `2`:
#   `0` is the full parent dataset, `1` is 50% of it, `2` is 5% of it
subsample_index: 0
# Names of the dataset splits
splits:
  - 'train'
  - 'validation'
# Default split ratio for the dataset
# NOTE(review): presumably positional with `splits` (0.9 train, 0.1 validation)
#   - confirm against the loader
default_split_ratio:
  - 0.9
  - 0.1

### Arguments to update based on instantiation + required functions

# Root folder that holds tokenizers, binaries and cache folders
root_data_path: './data'
# Seed used when splitting and shuffling datasets
seed: 42
# Preprocessing function applied to the dataset before tokenization; it is
# used in the `Dataset.map(...)` call
# NOTE(review): this names a wikitext preprocessor while `hf_dataset_id` is
#   SlimPajama-6B - confirm that this pairing is intended
preprocess_fn: 'preprocess_wikitext'
# Block size for data loading (scheduled for removal in the next iteration)
block_size: 1024