# Used to create the time measurements for large models.
#
# Trailing "alt:" comments list the alternative values that were already
# recorded next to each key in this file; they are kept for reference only
# and have no effect on the loaded configuration.
#
# Floats in exponent form keep an explicit decimal point and a signed
# exponent (6.0e-4, not 6e-4): some YAML 1.1 loaders, PyYAML among them,
# read the dot-less form as a string.
lr: 6.0e-4  # alt: 2.e-5, 0.00025
lr_decay_fn: "cosine"
lr_end_value: 6.0e-9  # used for linear decay only
train_steps: 100000  # alt: 10000, 800000, 600000*grad_accum
# warmup_pc:  # alt: 0.025 -- roughly 15000 for ... (original note is unfinished)
warmup: 2000
batch_size: 4  # alt: 8, 16, 32
shuffle_train: true
grad_accumulation_steps: 1
mixed_precision: "bf16"  # alt: "no"
# epochs: 100
weight_decay: 0.01
dataset_name: "pajama"  # alt: "bookcorpus", "tiny-stories"
hugg_chk: "google/gemma-2-2b"  # alt: "google/gemma-2-9b", "meta-llama/Llama-2-7b-hf"
block_type: "gemma"  # alt: "gemma-hugg", "lamma"
embed_type: "rope"
attention_type: "latte_mach_sliding_causal"  # alt: "standard_causal"
# NOTE(review): the exact meaning of L, state_dim, att_block_len and unroll is
# defined by the consuming code, which is not visible from this file.
L: 128  # alt: 64, 512, 768, 516
state_dim: 128
att_block_len: 128  # alt: 1024
unroll: 1024
eval_gen_len: 50
max_seq_len: 32000  # alt: 16000, 8000, 4096, 4000, 2048, 2000, 1024, 1000
pos_embed_max_len: 32000  # alt: 8000, 512, 1024, 2048
# Original note on eval_samples: "average over 30 batches".
# NOTE(review): that note does not match the current value (100) -- confirm.
eval_samples: 100  # alt: 40
eval_steps: 1000  # alt: 2000, 2000*grad_accum
max_checkpoints: 2
project: "gemma-mach"  # alt: "seqlen-gemma-mach", "latte-sota-dev"
# NOTE(review): "baesian" is kept exactly as written; it is an identifier
# consumed at runtime, so verify against the external account before changing.
entity: "baesian-learning"
# check_path: "/data_user/data/out_latte/test_gemma/checkpoints"
#   alt: "/data_user/data/out_latte/gemma_mach_interp_5K/checkpoints"
#   alt: "/data_user/data/out_latte/gemma_mach_interp_finetune/checkpoints"
# run_id: "gtbb6lv6"
wandb_log: false  # alt: true
disable_cache: false

####################### Overridden configs ################################
# NOTE(review): which base configuration these keys override is not visible
# from this file -- confirm against the config loader.
#
# Trailing "alt:" comments list alternative values that were already recorded
# next to the key; they have no effect on the loaded configuration.

# Dropout each layer; both dropout keys below are set to 0.0.
dropout: 0.0
dropout_att: 0.0
prenorm: true
batchnorm: false
hidden_dim: 768  # alt: 512, 16, 32, 1024
intermediate_dim: 3072
nlayers: 22  # alt: 12, 6, 8
nheads: 8  # alt: 2
num_key_value_heads: 4
