#!/usr/bin/env bash
#
# Launches three pretraining runs of torchrun_main.py back to back on the
# openwebtext dataset: llama60m, llama130m and llama250m. Each run is started
# through `python3 -m torch.distributed.run --standalone` with 8 processes on
# GPUs 0-7. The runs share every option except the five values passed to
# run_pretrain below.
#
# A failing run does not stop the runs that follow it; failures are reported on
# stderr and the script finishes with a non-zero status if any run failed.
#
# The code sticks to POSIX sh constructs so it also works when invoked as
# `sh <script>`.

# Hyperparameters shared by all runs.
lr=3e-3
iterative_warmup_steps=0

# Number of runs that exited with a non-zero status.
failed_runs=0

#######################################
# Starts one pretraining run and records whether it failed.
# Globals:
#   lr (read), iterative_warmup_steps (read), failed_runs (incremented on
#   failure)
# Arguments:
#   $1 - value for --run_name
#   $2 - value for --model_config (path to the model JSON config)
#   $3 - value for --batch_size
#   $4 - value for --num_training_steps
#   $5 - value for --warmup_steps
# Outputs:
#   Writes a diagnostic line to stderr when the run exits non-zero.
# Returns:
#   Always 0, so that the remaining runs are still attempted.
#######################################
run_pretrain() {
  # The environment variables are attached to the python3 command itself
  # rather than to the function call, so they never leak into this shell.
  TORCH_DISTRIBUTED_DEBUG=DETAIL CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
    python3 -m torch.distributed.run --standalone --nproc_per_node 8 \
    torchrun_main.py \
    --run_name "$1" \
    --model_config "$2" \
    --dataset_name openwebtext \
    --lr "$lr" \
    --batch_size "$3" \
    --total_batch_size 512 \
    --num_training_steps "$4" \
    --warmup_steps "$5" \
    --weight_decay 0 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --optimizer adam \
    --iterative_warmup_steps "$iterative_warmup_steps" \
    --only_save_last \
    --log_to_file \
    --save_dir checkpoints/ \
    --neuro_glia_network \
    --hidden_size 8 \
    --nonlinear_function learnable_sigmoid \
    --no_log \
    --functional_modulator \
    --channel_wise \
    --scalar_wise ||
    {
      printf 'run %s failed with exit status %s\n' "$1" "$?" >&2
      failed_runs=$((failed_runs + 1))
    }
  return 0
}

#            run name    model config             batch  steps  warmup
run_pretrain "llama60m"  configs/llama_60m.json   32     10000  1000
# NOTE(review): this run uses --batch_size 64 while the other two use 32;
# confirm that the difference is intentional.
run_pretrain "llama130m" configs/llama_130m.json  64     20000  2000
run_pretrain "llama250m" configs/llama_250m.json  32     40000  4000

# Final status: success only if every run succeeded. A plain test is used
# instead of `exit` so that sourcing this file cannot terminate the caller's
# shell.
[ "$failed_runs" -eq 0 ]


