name: random-tokens
modality: text
task: causal-lm # options: masked-lm / causal-lm

# only used when task=masked-lm:
mlm_probability: 0.15
disable_mlm: False
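# Note: mlm_probability is the fraction of tokens randomly masked for the
# masked-lm objective (0.15 is the standard BERT value); judging by the key
# name, disable_mlm: True presumably turns masking off entirely.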

path: "~/data"
size: 512
shape:
  - 32 # sequence_length
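# => each sample is presumably a length-32 sequence of token ids, so with
#    size: 512 the dataset holds 512 such random sequences.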

# Preprocessing
# re-use the same tokenizer as the wikitext dataset:
tokenizer: GPT-2
vocab_size: 50257
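# 50257 is the GPT-2 BPE vocabulary size; random token ids are presumably
# drawn from [0, vocab_size).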

# Federated Learning specifics:
default_clients: 1000 # estimate of the number of articles in the dataset
partition: given # use the natural data partition
examples_from_split: training

# Data-specific implementation constants:
batch_size: 128
caching: False
defaults:
  - db: none # database setup; activate the LMDB module from the command line with data.db=LMDB
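
# Example overrides (Hydra-style, inferred from the defaults list above;
# the entrypoint name is hypothetical):
#   python train.py data=random-tokens data.db=LMDB data.batch_size=64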
