---
# Flat key/value settings for a training run (dataset_name is "cifar10").
# A trailing comment after a value lists alternative values left by the
# author; they are notes only and are not read by the YAML loader.
# NOTE(review): the meaning of each key is defined by the code that loads
# this file, which is not visible here — confirm semantics against it.

# --- optimisation / schedule ---
lr: 1.e-3
small_lr: 1.e-3
lr_decay_fn: "cosine"  # "constant"
# train_steps: 2000
batch_size: 32  # 90
epochs: 200

# --- data ---
dataset_name: "cifar10"  # "pathfinder32" # "pathfinder128" # "imdb" # "aan"
normalize_img: false
tokenize_img: false  # true
conv_embed: false  # true

# --- model ---
pool: "mean"
# dropout each layer
dropout: 0.1
batchnorm: false
prenorm: true
attention_type: "latte_mach_sliding_bid"  # "latte_convQR_bid" # "latte_bid"
block_type: "transformer-sota"  # "glu" # "transformer"
embed_type: "absolute"  # "nope" # "rope"
att_block_len: 128
weight_decay: 0.05
hidden_dim: 512
nlayers: 6
nheads: 4
max_seq_len: 1024  # 4000 # 2000 # 16384
# L, unroll and eval_steps each appeared twice in this file with identical
# values. Duplicate mapping keys are invalid YAML (strict loaders reject
# them, lenient ones silently keep the last), so each is defined once here.
L: 40  # 512 # 128
unroll: 100
num_classes: 10
eval_steps: 0  # every iter

# --- logging ---
project: "lra_lattev2"
# NOTE(review): "baesian-learning" may be a misspelling of "bayesian"; left
# unchanged because it presumably must match an existing account/team name.
entity: "baesian-learning"
wandb_log: true  # false
disable_cache: false
