# Optimizer and training-schedule settings.
lr: 1.0e-4
lr_decay_fn: "linear"
# Used for linear decay only. Alternative value noted: 6.e-5
lr_end_value: 5.0e-9
# Alternative value noted: 125000 (the original note also read "2000 *",
# left unfinished).
train_steps: 1000000
warmup: 100
# Depends on the number of devices available.
# Alternative values noted: =4*12, 192 (=24*8), 256, 128, 64
batch_size: 8
eval_num_batches: 3
grad_accumulation_steps: 1
# epochs: 100
weight_decay: 0.1
# Model architecture settings.
# Dropout for each layer.
dropout: 0.1
prenorm: true
batchnorm: false
hidden_dim: 1024  # alternative value noted: 768
nlayers: 12
nheads: 16
# NOTE(review): the meaning of `L` is not documented in this file - confirm
# against the code that consumes this config.
# Alternative values noted: 320, 512, 128, 768, 516
L: 48
# Alternative values noted: "nope_standard_causal", "rot_stable_latte",
# "stable_latte"
attention_type: "nope_stable_latte"
block_type: "transformer"
pos_embed_max_len: 1024
# Original note: "context_len: int = 220"
max_seq_len: 220

# Sequence-length settings for the training and evaluation data.
min_train_len: 5
max_train_len: 50
min_eval_len: 48
max_eval_len: 50
eval_context_len: 220

# Evaluation cadence, task selection, and experiment logging.
eval_steps: 1000
shuffle_train: false
train_task: "copy"
eval_task: "copy"
model: "latte"
# NOTE(review): presumably the Weights & Biases project/entity used when
# wandb_log is enabled. "baesian-learning" may be a misspelling - confirm
# against the real account name before changing it.
project: "rebuttal_copy"
entity: "baesian-learning"
# TODO: Note to a better self - Get rid of hardcoded path
# NOTE(review): no hardcoded path appears in the lines shown here; the TODO
# may refer to code elsewhere.
wandb_log: true
# Alternative value noted: true
disable_cache: false