# LEARNABLE: launch src/lgmodeling/finetune.py from the wt103 "learnable"
# checkpoint (best_60000), seed 0.
# The VAR=value words ahead of `python` are prefix assignments: they are placed
# in the environment of this single command and do not persist in the shell.
# NOTE(review): TF32 override off, highest matmul precision and the fixed cuBLAS
# workspace config are presumably there for reproducible numerics - confirm.
NVIDIA_TF32_OVERRIDE=0 \
JAX_DEFAULT_MATMUL_PRECISION=highest \
CUBLAS_WORKSPACE_CONFIG=:4096:8 \
WANDB_MODE=online \
CUDA_VISIBLE_DEVICES=0 \
python src/lgmodeling/finetune.py \
    --model-path /root/weights/wt103/lr0.00025-learnable-step60000-warm2000-size64-layer12-embd192-heads3-shared1-routed0-topk0/best_60000 \
    --seed 0 \
    --tgt_len 256 \
    --mem_len 0 \
    --eval_tgt_len 256 \
    --n_head 3 \
    --lmc-layer-indices 0 \
    --learning-rate 0.00025 \
    --batch-size 64 \
    --max_step 60000 \
    --warmup_step 2000 \
    --dataset wt103 \
    --model-save-dir /root/weights/wt103/gpt2-finetune \
    --data-path /root/datasets/wt103

# LEARNABLE: launch src/lgmodeling/finetune.py from the wt103 "learnable"
# checkpoint (best_60000), seed 0.
# The VAR=value words ahead of `python` are prefix assignments: they are placed
# in the environment of this single command and do not persist in the shell.
# NOTE(review): every setting here (checkpoint, --seed 0, --model-save-dir, ...)
# matches the preceding LEARNABLE invocation in this file - confirm the repeat
# is intentional and that a different configuration was not meant.
NVIDIA_TF32_OVERRIDE=0 \
JAX_DEFAULT_MATMUL_PRECISION=highest \
CUBLAS_WORKSPACE_CONFIG=:4096:8 \
WANDB_MODE=online \
CUDA_VISIBLE_DEVICES=0 \
python src/lgmodeling/finetune.py \
    --model-path /root/weights/wt103/lr0.00025-learnable-step60000-warm2000-size64-layer12-embd192-heads3-shared1-routed0-topk0/best_60000 \
    --seed 0 \
    --tgt_len 256 \
    --mem_len 0 \
    --eval_tgt_len 256 \
    --n_head 3 \
    --lmc-layer-indices 0 \
    --learning-rate 0.00025 \
    --batch-size 64 \
    --max_step 60000 \
    --warmup_step 2000 \
    --dataset wt103 \
    --model-save-dir /root/weights/wt103/gpt2-finetune \
    --data-path /root/datasets/wt103

