---
# Flat run/hyperparameter configuration.
# The code that loads this file is not part of it, so notes marked
# "presumably" or "NOTE(review)" are assumptions to confirm against the loader.
# Booleans are written in canonical lowercase (true/false) so every YAML
# loader resolves them as booleans.

# Model width. 384 / 12 = 32; presumably embed_dim must stay divisible by
# num_heads -- confirm against the attention implementation.
num_heads: 12
embed_dim: 384

# Optimizer settings (presumably an Adam-style optimizer, given the
# two-element betas -- confirm).
betas: [0.9, 0.95]
weight_decay: 0.01
# Keep the decimal point and signed exponent: some YAML 1.1 loaders
# (e.g. PyYAML) read forms such as 5e-4 as a string, not a float.
lr: 5.0e-4

# Batching / data loading.
# NOTE(review): unclear from this file whether batch_size is per device or
# global across num_nodes -- confirm.
batch_size: 64
accumulate_grad_batches: 1
num_workers: 4

# Trainer-level settings.
gradient_clip_val: 1.0
precision: "bf16-mixed"
num_nodes: 8
max_epochs: 10

# NOTE(review): presumably the number of iterations of the iterated
# model/attention block -- confirm.
num_iter: 32
# NOTE(review): distinct from `betas` above; presumably an averaging/decay
# coefficient used together with update_after_step / update_every -- confirm.
beta: 0.995

# Free-form label for this run.
tag: itr_attention

# Run-mode switches.
test_only: false
use_deq: false

# Dataset selection for training and testing.
train_dataset_name: arc
test_dataset_name: arc
check_val_every_n_epoch: 1

resume: false
use_compile: true
use_best: false

# NOTE(review): presumably the schedule for the update governed by `beta`
# (start after N steps, then apply every M steps) -- confirm.
update_after_step: 10
update_every: 1

test_batch_size: 8

# NOTE(review): semantics of the keys below are not derivable from this file;
# verify against the model code before changing them.
num_remain_grad: 8
use_cross_attn: true
num_rep_attn: 4
max_votes_exponent: 10
num_layers: 1