---
# Training run configuration (flat mapping of hyperparameters and logging
# options).
# NOTE(review): the code that loads this file is not visible here, so the
# blank-line groupings and any remarks on meaning are inferred from key names
# only -- confirm against the loader.
# Trailing "alt:" comments keep the other values the original author recorded
# next to each key.

# Floats in exponent form keep an explicit decimal point (1.0e-4, not 1e-4):
# YAML 1.1 loaders such as PyYAML resolve a bare `1e-4` as a string.
lr: 1.0e-4  # alt: 0.002
small_lr: 1.0e-4  # alt: 0.001
lr_decay_fn: 'cosine'  # alt: "constant"
batch_size: 32  # alt: 8, 16, 90
# train_steps: 20000
# Original note: "results in approx 7354 warmup steps".
# NOTE(review): unclear whether that figure belongs to 0.08 or to the
# alternative 0.25 -- TODO confirm.
warmup_pc: 0.08  # alt: 0.25
epochs: 20

dataset_name: 'aan'  # alt: "pathfinder128", "imdb"
pool: 'mean'  # alt: "CLS", "last"

# Original note: "dropout each layer".
dropout: 0.1
weight_decay: 0.01
batchnorm: false
prenorm: true

attention_type: 'latte_mach_sliding_bid'  # alt: "latte_convQR_bid", "latte_bid"
block_type: 'transformer-sota'  # alt: "glu", "transformer"
embed_type: 'nope'  # alt: "absolute", "rope"
hidden_dim: 128  # alt: 256
nlayers: 6
nheads: 4

max_seq_len: 4000  # alt: 16384, 2000, 1024
pos_embed_max_len: 4000
num_classes: 2
L: 40  # alt: 128
unroll: 100

eval_steps: 0  # original note: "every iter"

project: 'lra_lattev2'
# NOTE(review): "baesian" looks like a misspelling of "bayesian", but this
# value must match the existing account/entity name exactly -- left unchanged;
# verify before editing.
entity: 'baesian-learning'
wandb_log: true  # alt: false
disable_cache: false