---
# Training / evaluation configuration: a flat mapping of hyperparameters.
# NOTE(review): the section headings below are inferred from key names only;
# confirm each against the code that consumes this file.

# Model size.
# NOTE(review): 384 / 12 = 32 per head if embed_dim is split evenly across
# heads - confirm.
num_heads: 12
embed_dim: 384

# Optimizer (presumably an Adam-style optimizer, given `betas` - confirm).
betas: [0.9, 0.95]
weight_decay: 0.01
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML read forms like `5e-4` as a string rather than a float.
lr: 5.0e-4

# Batching, hardware, and schedule.
batch_size: 64
accumulate_grad_batches: 1
num_workers: 4
gradient_clip_val: 1.0
precision: "bf16-mixed"
num_nodes: 8
max_epochs: 10
num_iter: 16
# Distinct key from `betas` above (not a duplicate).
# NOTE(review): presumably a moving-average decay that pairs with
# update_after_step / update_every further down - confirm.
beta: 0.995

# Run identification and mode.
tag: itr_attention
test_only: false

# Datasets.
train_dataset_name: sudoku-extreme
test_dataset_name: sudoku-extreme
check_val_every_n_epoch: 1

# Checkpointing / compilation switches (canonical lowercase booleans).
resume: false
use_compile: true
use_best: false

# NOTE(review): step-based update cadence; presumably tied to `beta` - confirm.
update_after_step: 100
update_every: 10

# Evaluation batch size.
test_batch_size: 8

# Attention / iteration options.
# NOTE(review): semantics are not visible from this file - see the model code.
num_remain_grad: 2
use_cross_attn: true
num_rep_attn: 4

# Voting / confidence options.
# NOTE(review): names suggest the vote counts are exponents (e.g. 2**k), but
# the base is not visible from this file - confirm.
max_votes_exponent: 10
multi_votes_exp_exponent: 5
confidence_type: "log_prob"