# Run configuration; the active dataset is "listops" (see dataset_name).
# "alternatives" comments list values that were left commented out next to
# the active value; they are kept for reference only and are not parsed.
# NOTE(review): key semantics are defined by the code that loads this file —
# confirm there before changing values.
lr: 1.e-3  # alternatives: 3.e-3, 1.e-2 (keep the "1.e-3" form: it parses as a float)
lr_decay_fn: "cosine"  # alternative: "constant"
# train_steps: 10
batch_size: 64
epochs: 40
pool: "mean"
weight_decay: 0.04  # alternative: 0.05
dataset_name: "listops"  # alternatives: "pathfinder128", "imdb", "aan"
dropout: 0.0
warmup_pc: 0
# alternatives: "latte_mach_sliding_bid", "latte_bid"
attention_type: "latte_convQR_bid"
block_type: "transformer-sota"  # alternatives: "glu", "transformer"
embed_type: "nope"  # alternatives: "rope", "absolute"
batchnorm: false
prenorm: true
hidden_dim: 128
nlayers: 8
state_dim: 128
att_block_len: 128
nheads: 4
L: 40
unroll: 100
max_seq_len: 2048
pos_embed_max_len: 2048
num_classes: 10
eval_steps: 0  # every iter
project: "lra_lattev2"
# NOTE(review): spelling "baesian" kept as-is; confirm it matches the real
# entity name before "correcting" it.
entity: "baesian-learning"
wandb_log: true  # alternative: false
disable_cache: false
# NOTE(review): absolute, machine-specific path; it must exist on the host
# running this config.
# alternative: '/home/ubuntu/latte_trans/data/tokenizers/tok_list_ops.json'
tokenizer_path: "/home/user/latte_trans/data/tokenizers/tok_list_ops.json"
