# Experiment configuration (flat key/value mapping).
# The code that consumes this file is not visible from here, so the notes
# below are limited to what the key names and values show; anything marked
# "presumably" or "NOTE(review)" should be confirmed against the loader.
# Values after "alt:" are alternatives the file kept in trailing comments;
# they are NOT active.

lr: 0.001
small_lr: 0.0001
# Disabled (commented out in the original file):
# train_steps: 10
batch_size: 64  # alt: 90
epochs: 200
dataset_name: "pathfinder32"  # alt: "pathfinder128", "imdb", "aan"
conv_embed: false
pool: "mean"
weight_decay: 0.03
# Dropout for each layer (per the original note).
dropout: 0.05
dropout_att: 0.0
batchnorm: false
prenorm: true
attention_type: "latte_mach_sliding_bid"  # alt: "latte_convQR_bid", "latte_bid"
block_type: "transformer-sota"  # alt: "glu", "transformer"
embed_type: "absolute"  # alt: "nope", "rope"
hidden_dim: 192  # alt: 256
att_block_len: 128
nlayers: 6
nheads: 4
max_seq_len: 1024  # alt: 4000, 2000, 16384
L: 40  # alt: 256
state_dim: 128
blocks: 8
unroll: 100
num_classes: 2
eval_steps: 0
# Presumably Weights & Biases logging settings (see wandb_log) -- TODO confirm.
project: "lra_lattev2"
# NOTE(review): spelled "baesian" (not "bayesian"). Left unchanged because it
# is a runtime identifier; confirm it matches the real entity name.
entity: "baesian-learning"
wandb_log: true
disable_cache: false