# Default configuration values.
# Logging / evaluation / checkpoint cadence.
# NOTE(review): units (iterations vs. epochs) are not stated in this file — confirm against the consumer.
log_interval: 3
# NOTE(review): key is spelled "eval_itervals" (not "eval_intervals"); it must match
# whatever name the consuming code reads, so do not rename it here in isolation.
eval_itervals: 5
# NOTE(review): the name reads like a boolean flag but the value is an integer —
# confirm how the consumer interprets it.
always_save_checkpoint: 8000
model: 'GeST'
bin_type: 'nobin'


# Path to the codebook array (.npy file).
codebook: './metacell/codebook3000.npy'

output_dir: 'demo'
comment: 'ICLR'
init_from: 'scratch'
# Modes listed by the author: 'frozenAE', 'finetune', 'scratch'.
# Author's note: when init_from is not 'scratch', the train mode is set to 'finetune'.
train_mode: 'scratch'

epoch: 100

round0: 0.3
round1: 0.3
block_size: 768
rope_base: 0.5
loc_emb: 'sinu'

# Encoder alternatives noted by the author: 'multimlp', 'mlp', 'scimilarity'.
encoder: 'mlp'
decoder: 'mlp'

# dataset
zscore: false

# DDP settings
backend: 'nccl'
# Author's note: total samples per step =
#   batch_size * card_number * gradient_accumulation_steps / card_number
#   = batch_size * gradient_accumulation_steps
gradient_accumulation_steps: 2
compile: false

batch_size: 64
# Alternative noted by the author: 'random'.
idxshuffle: 'corner'



# model
vocab_size: 1122
n_embd: 768
n_layer: 8
n_head: 8
dropout: 0.2
bias: false
# Alternatives noted by the author: 'msle', 'mse'.
task: 'quantize'
scale_lambda: 1
loss_len: 10
skipconnect: false
noise: 0

# adamw optimizer
# Author's note on other learning rates: AE 0.001, Trans 0.001.
learning_rate: 0.0005
max_iters: 4000000
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
grad_clip: 2.0

# Experimental: cosine-annealing learning rate.
# Learning-rate decay settings.
decay_lr: true
warmup_iters: 1000
# Author's note: "r20 50000".
lr_decay_iters: 500000
min_lr: 5.0e-5
# system
device: 'cuda'
dtype: 'bfloat16'

# inference path
# NOTE(review): a bare `None` is not YAML null; standard YAML loaders read it as the
# string "None". Confirm the consumer handles that string, or switch to `null` if it
# expects a missing value.
ckpt_path: None
infersave_path: None
# NOTE(review): the purpose of N is not evident from this file — TODO document.
N: 4