# Flat hyperparameter / run configuration.
# The code that consumes these keys is not visible from this file, so any
# comment marked NOTE(review) is an assumption to confirm, not a fact.
# Booleans are written as lowercase true/false so every YAML loader
# resolves them identically.

# Model width settings; embed_dim (384) is evenly divisible by num_heads (12).
num_heads: 12
embed_dim: 384

# NOTE(review): presumably optimizer settings (betas pair, weight decay,
# learning rate) - confirm which optimizer reads them.
betas: [0.9, 0.95]
weight_decay: 0.01
# Keep both the decimal point and the signed exponent: some YAML 1.1 loaders
# (e.g. PyYAML) resolve exponent forms without a decimal point, such as
# 5e-4, as strings rather than floats.
lr: 5.0e-4

# NOTE(review): these key names match PyTorch Lightning Trainer / DataLoader
# arguments - confirm against the training entry point.
batch_size: 64
accumulate_grad_batches: 1
num_workers: 1
gradient_clip_val: 1.0
precision: "bf16-mixed"
num_nodes: 8
max_epochs: 200

num_iter: 4
# Scalar `beta` is a separate key from the `betas` pair above; both are used.
# NOTE(review): possibly a moving-average decay paired with
# update_after_step / update_every below - confirm.
beta: 0.995

# Run tag, dataset names, and run-control switches.
# The train and test dataset names differ (sudoku vs. sudoku-hard).
tag: itr_attention
test_only: false
train_dataset_name: sudoku
test_dataset_name: sudoku-hard
check_val_every_n_epoch: 1
resume: false
use_compile: true
use_best: true

# NOTE(review): the *_votes_exponent values look like exponents of a vote
# count (base not shown here) - confirm against the evaluation code.
min_votes_exponent: 0
max_votes_exponent: 12
num_remain_grad: 2

update_after_step: 100
update_every: 10

test_batch_size: 16

# Architecture and inference switches.
ffn_dim_multiplier: 4
use_cross_attn: true
num_rep_attn: 4
multi_votes_exp_exponent: 5
use_mpc: false
# NOTE(review): presumably only consulted when use_mpc is true - confirm.
mpc_every: 4
use_transformer: false
num_layers: 1
no_truncation: false
confidence_type: "log_prob"
