# used to train on the book corpus
# Optimization and training schedule.
# Floats are written with an explicit decimal point and a signed exponent:
# YAML 1.1 loaders such as PyYAML resolve a bare `6e-4` as a string.
# Previously tried: 0.00025
lr: 6.0e-4
lr_decay_fn: "cosine"
# Original note: used for linear decay only
# (lr_decay_fn is currently "cosine").
lr_end_value: 6.0e-9
# Previously tried: 20000, 100000, 10000, 800000, 600000*grad_accum
train_steps: 300000
# Disabled key; the original note is truncated:
# warmup_pc: 0.025  # roughly 15000 for ...
warmup: 1000
# Depends on the number of devices available.
# Previously tried: 60, 4, 4, 12, 80, 320, 160, 80, 640, 80, 160, =4*12,
# 192 (=24*8), 256, 128, 64
batch_size: 4
# Alternative: false
shuffle_train: true
grad_accumulation_steps: 1
# Alternative: "no" -- keep it quoted; an unquoted `no` is parsed as the
# boolean false by YAML 1.1 loaders.
mixed_precision: "bf16"
# epochs: 100
weight_decay: 0.01
# Alternative: "owt"
dataset_name: "bookcorpus"
# Dropout for each layer (original note: "dropout each layer").
dropout: 0.0
dropout_att: 0.0
# Booleans use the canonical lowercase spelling (yamllint `truthy`).
prenorm: true
batchnorm: false
# Model dimensions. Values listed as "previously tried" are kept from the
# original inline comments.
# Previously tried: 512, 16, 32, 512, 1024
hidden_dim: 512
intermediate_dim: 2048
# Previously tried: 12, 6, 8
nlayers: 12
# Previously tried: 2
nheads: 8
# Previously tried: 64, 512, 768, 516
L: 128
state_dim: 128
att_block_len: 128
embed_type: "rope"
# Alternatives: "standard_causal", "latte_convQR_causal", "latte_causal",
# "xpos_latte_mach_sliding"
attention_type: "latte_mach_sliding_causal"
# Alternatives: "griffin", "mamba", "transformer-sota", "transformer",
# "transformer-qual", "mega", "linear-transformer", "transformer-sota",
# "rwkv", "lamma"
block_type: "gemma"
unroll: 1024
# Evaluation and sequence-length settings.
# Previously tried: 40. Original note: average over 30 batches.
eval_samples: 100
eval_gen_len: 50
# Previously tried: 5120, 16000, 1024
# NOTE(review): this is larger than pos_embed_max_len (5120); confirm that
# is intended with embed_type "rope".
max_seq_len: 16000
# Previously tried: 1024
pos_embed_max_len: 5120
# Original note: 2000 *grad_accum
eval_steps: 1000
# Experiment tracking, file locations and runtime flags.
# Alternative: "latte-sota-dev"
project: "seqlen-gemma-mach"
entity: "baesian-learning"
# TODO: get rid of the hardcoded, machine-specific paths below.
# NOTE(review): the key is spelled `promt_path`; presumably the consumer
# looks up that exact name, so it is left unchanged here.
promt_path: "/home/user/latte/latte/experiments/config/promts.txt"
# Checkpoint directories used previously:
#   "/data_user/data/out_latte/bookcorp_seqlen_att/checkpoints"
#   "/data_user/data/out_latte/bookcorpus_gemma_machv2/checkpoints"
#   "/data_user/data/out_latte/bookcorpus_gemma_full_att/checkpoints"
#   "/data_user/data/out_latte/mamba2"
#   "/data_user/data/out_latte/owt_256l_h2/checkpoints"
check_path: "/data_user/data/out_latte/bookcorp_seqlen_mach/checkpoints"
# run_id: "3qmfzbkb"
# Alternative: true
wandb_log: false
# Alternative: true
disable_cache: false