---
# Flat experiment configuration: every setting is a top-level key.
# NOTE(review): the code that consumes this file is not visible from here, so
# the grouping comments below are inferred from key names only — confirm each
# against the loader before relying on them.

# Model size. 384 / 12 = 32, so embed_dim is evenly divisible by num_heads.
num_heads: 12
embed_dim: 384

# Optimizer hyperparameters (presumably an Adam-style optimizer, given `betas`
# — TODO confirm).
betas: [0.9, 0.95]
weight_decay: 0.01
# Written with a decimal point and a signed exponent so that YAML 1.1 loaders
# (e.g. PyYAML) resolve it as a float rather than a string.
lr: 5.0e-4

# Training-loop settings. Several names here (accumulate_grad_batches,
# gradient_clip_val, precision, num_nodes, max_epochs) match PyTorch Lightning
# Trainer arguments — presumably forwarded to it; verify against the caller.
batch_size: 64
accumulate_grad_batches: 2
num_workers: 1
gradient_clip_val: 1.0
precision: "bf16-mixed"
num_nodes: 8
max_epochs: 6000

num_iter: 32
# NOTE(review): `beta`, together with update_after_step / update_every further
# down, looks like a set of moving-average (EMA) settings — confirm.
beta: 0.995

# Run identification and mode.
tag: itr_attention
test_only: false
train_dataset_name: maze
test_dataset_name: maze
check_val_every_n_epoch: 1
resume: false
log_every_n_steps: 1
use_compile: true
use_best: false

update_after_step: 10
update_every: 1

test_batch_size: 1
num_remain_grad: 4
use_cross_attn: true
num_rep_attn: 4
max_votes_exponent: 10

# NOTE(review): mpc_every presumably only takes effect when use_mpc is true —
# confirm against the consumer.
use_mpc: false
mpc_every: 4

no_truncation: false
multi_votes_exp_exponent: 6
confidence_type: "log_prob"