import time

dataset = 'shakespeare'
init_from = 'gpt2'

out_dir = f'/out-lora-shakespeare-{init_from}-2wd'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 200

# tunable wd
weight_decay = 0.23
delta = 0.99
decay_wd = True

# finetune at constant LR
learning_rate = 2e-4
decay_lr = False

device = 'mps'
compile = False
compute_grad_memory = True

# LoRA parameters
lora_rank = 8
lora_alpha = 32
lora_dropout = 0
