# config for training the integrate model, borrowed from https://github.com/karpathy/nanoGPT/

# output directory for this run (a relative path)
out_dir = "alpha_integrate/train/out-integrate"

# evaluation cadence
# NOTE(review): presumably "evaluate every eval_interval iterations over
# eval_iters batches", as in nanoGPT -- confirm against the training script
eval_interval = 250
eval_iters = 100

# keep console output sparse: don't print too often
log_interval = 50

# a checkpoint is written only when val improves
always_save_checkpoint = False

# wandb logging is off by default; override via the command line if desired
wandb_log = False
wandb_project = "integrate"
wandb_run_name = "integrate-gpt"

# no dataset is selected in this file; the assignment is left commented out
# dataset = ''
gradient_accumulation_steps = 1
batch_size = 256
# context of up to 1024 previous characters
block_size = 1024

# a small ("baby") GPT model :)
n_layer = 6
n_head = 6
# n_embd divides evenly by n_head: 384 / 6 = 64
n_embd = 384
dropout = 0.2

# with baby networks we can afford a somewhat higher learning rate
learning_rate = 1e-3
max_iters = 50_000
# usually kept equal to max_iters
lr_decay_iters = 50_000
# usually learning_rate / 10
min_lr = 1e-4
# made a bit larger because the number of tokens per iteration is small
# NOTE(review): this rationale came with the borrowed nanoGPT config; with
# batch_size = 256 and block_size = 1024 here, confirm it still applies
beta2 = 0.99

# warmup is potentially not strictly necessary
warmup_iters = 100
weight_decay = 0.1

# on macbook also add
# device = 'cpu'  # run on cpu only
# compile = False # do not torch compile the model
