# Optimizer learning rates and decay schedule.
lr: 1.0e-3
small_lr: 1.0e-3  # 0.001
# Other schedule noted here: "constant".
lr_decay_fn: "cosine"
warmup_pc: 0.0
# train_steps: 20000

# Training run size and dataset selection.
batch_size: 32  # other values noted here: 36, 90
epochs: 32
# Other dataset names noted here: "listops", "pathfinder128", "aan".
dataset_name: "imdb"
# Model and regularization settings.
pool: "mean"
dropout: 0.1  # dropout each layer
weight_decay: 0.05
batchnorm: false
prenorm: true
# Other attention types noted here: "latte_convQR_bid", "latte_bid".
attention_type: "latte_mach_sliding_bid"
# Other block types noted here: "glu", "transformer".
block_type: "transformer-sota"
# Other embedding types noted here: "absolute", "rope".
embed_type: "nope"
hidden_dim: 256
nlayers: 6
nheads: 4
# Other sequence lengths noted here: 16384, 2000, 1024.
max_seq_len: 4000
pos_embed_max_len: 4000
num_classes: 2
# NOTE(review): the meaning of `L` and `unroll` is not visible in this
# file -- confirm against the code that reads this config.
L: 256
unroll: 100
# Declared exactly once: duplicate mapping keys are invalid YAML, and
# loaders differ on them (strict ones error, lenient ones keep the last).
eval_steps: 0  # every iter
# Run logging (presumably Weights & Biases, given `wandb_log`) and caching.
project: "lra_lattev2"
# NOTE(review): spelled "baesian" -- confirm this matches the real entity
# name before changing it; the string is used as-is.
entity: "baesian-learning"
wandb_log: true  # set to false to turn this off
disable_cache: false