# ### 60M: num_training_steps=10000, warmup_steps=1000, model_config=configs/llama_60m.json
# ws_beta=0
# update_interval=100
# pruning_method="ri"
# remove_method="weight_magnitude_soft"
# sparsity_distribution="uniform"
# zeta=0.1
# iterative_warmup_steps=20

# model_config="configs/llama_60m.json"
# num_training_steps=10000
# warmup_steps=1000

# datasets=("openwebtext" "c4")



# lrs=(3e-3)
# las=(0)

# # sparsities=(0.95 0.95 0.9 0.85 0.95 0.9 0.85 0.8 0.75)
# # ranks=(24 72 48 24 120 96 72 48 24)
# sparsities=(0.95 0.9 0.85)
# ranks=(16 32 48)
# las=(32)

# for dataset in "${datasets[@]}"
# do
#     for lr in "${lrs[@]}"
#     do
#         for la in "${las[@]}"
#         do
#             for ((i=0; i<${#sparsities[@]}; i++)); do
#                 # static
#                 HF_HOME="/data/hf_cache" HF_DATASETS_OFFLINE=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --standalone --nproc_per_node 8 torchrun_main.py --run_name "static_sp${sparsities[i]}+r${ranks[i]}+la$la" --model_config $model_config --dataset_name $dataset --lr $lr --batch_size 64 --total_batch_size 512 --num_training_steps $num_training_steps --warmup_steps $warmup_steps --weight_decay 0 --dtype bfloat16 --eval_every 1000 --optimizer adam --iterative_warmup_steps $iterative_warmup_steps --update_interval $update_interval --sparsity ${sparsities[i]} --only_save_last --dst_scheduler --static_dst --no_log --log_to_file --save_dir checkpoints/ --only_save_last --sltrain --rank ${ranks[i]} --lora_alpha $la --no_compute_similarity --scheduler cosine

#             done
#         done
#     done

# done

# ### 130M: this block sets num_training_steps=20000, warmup_steps=2000, model_config=configs/llama_130m.json (header previously listed 40000 / 10000)
# ws_beta=0
# update_interval=100
# pruning_method="ri"
# remove_method="weight_magnitude_soft"
# sparsity_distribution="uniform"
# zeta=0.1
# iterative_warmup_steps=20

# model_config="configs/llama_130m.json"
# num_training_steps=20000
# warmup_steps=2000

# datasets=("openwebtext" "c4")



# lrs=(3e-3)
# las=(0)

# sparsities=(0.95 0.9 0.85)
# ranks=(24 48 72)
# las=(16)

# for dataset in "${datasets[@]}"
# do
#     for lr in "${lrs[@]}"
#     do
#         for la in "${las[@]}"
#         do
#             for ((i=0; i<${#sparsities[@]}; i++)); do
#                 # static
#                 HF_HOME="/data/hf_cache" HF_DATASETS_OFFLINE=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --standalone --nproc_per_node 8 torchrun_main.py --run_name "static_sp${sparsities[i]}+r${ranks[i]}+la$la" --model_config $model_config --dataset_name $dataset --lr $lr --batch_size 64 --total_batch_size 512 --num_training_steps $num_training_steps --warmup_steps $warmup_steps --weight_decay 0 --dtype bfloat16 --eval_every 1000 --optimizer adam --iterative_warmup_steps $iterative_warmup_steps --update_interval $update_interval --sparsity ${sparsities[i]} --only_save_last --dst_scheduler --static_dst --no_log --log_to_file --save_dir checkpoints/ --only_save_last --sltrain --rank ${ranks[i]} --lora_alpha $la --no_compute_similarity --scheduler cosine

#             done
#         done
#     done

# done



# ### 60M: num_training_steps=10000, warmup_steps=1000, model_config=configs/llama_60m.json
# ws_beta=0
# update_interval=100
# pruning_method="ri"
# remove_method="weight_magnitude_soft"
# sparsity_distribution="uniform"
# zeta=0.1
# iterative_warmup_steps=20

# model_config="configs/llama_60m.json"
# num_training_steps=10000
# warmup_steps=1000

# datasets=("c4")



# lrs=(3e-3)
# las=(0)

# # sparsities=(0.95 0.95 0.9 0.85 0.95 0.9 0.85 0.8 0.75)
# # ranks=(24 72 48 24 120 96 72 48 24)
# sparsities=(0.95 0.95 0.95)
# ranks=(16 48 80)
# las=(32)

# for dataset in "${datasets[@]}"
# do
#     for lr in "${lrs[@]}"
#     do
#         for la in "${las[@]}"
#         do
#             for ((i=0; i<${#sparsities[@]}; i++)); do
#                 # static
#                 HF_HOME="/data/hf_cache" HF_DATASETS_OFFLINE=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --standalone --nproc_per_node 8 torchrun_main.py --run_name "static_sp${sparsities[i]}+r${ranks[i]}+la$la" --model_config $model_config --dataset_name $dataset --lr $lr --batch_size 64 --total_batch_size 512 --num_training_steps $num_training_steps --warmup_steps $warmup_steps --weight_decay 0 --dtype bfloat16 --eval_every 1000 --optimizer adam --iterative_warmup_steps $iterative_warmup_steps --update_interval $update_interval --sparsity ${sparsities[i]} --only_save_last --dst_scheduler --static_dst --no_log --log_to_file --save_dir checkpoints/ --only_save_last --sltrain --rank ${ranks[i]} --lora_alpha $la --no_compute_similarity --scheduler cosine

#             done
#         done
#     done

# done

# ### 130M: this block sets num_training_steps=20000, warmup_steps=2000, model_config=configs/llama_130m.json (header previously listed 40000 / 10000)
# ws_beta=0
# update_interval=100
# pruning_method="ri"
# remove_method="weight_magnitude_soft"
# sparsity_distribution="uniform"
# zeta=0.1
# iterative_warmup_steps=20

# model_config="configs/llama_130m.json"
# num_training_steps=20000
# warmup_steps=2000

# datasets=("c4")



# lrs=(3e-3)
# las=(0)

# sparsities=(0.95 0.95 0.95)
# ranks=(24 72 120)
# las=(16)

# for dataset in "${datasets[@]}"
# do
#     for lr in "${lrs[@]}"
#     do
#         for la in "${las[@]}"
#         do
#             for ((i=0; i<${#sparsities[@]}; i++)); do
#                 # static
#                 HF_HOME="/data/hf_cache" HF_DATASETS_OFFLINE=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --standalone --nproc_per_node 8 torchrun_main.py --run_name "static_sp${sparsities[i]}+r${ranks[i]}+la$la" --model_config $model_config --dataset_name $dataset --lr $lr --batch_size 64 --total_batch_size 512 --num_training_steps $num_training_steps --warmup_steps $warmup_steps --weight_decay 0 --dtype bfloat16 --eval_every 1000 --optimizer adam --iterative_warmup_steps $iterative_warmup_steps --update_interval $update_interval --sparsity ${sparsities[i]} --only_save_last --dst_scheduler --static_dst --no_log --log_to_file --save_dir checkpoints/ --only_save_last --sltrain --rank ${ranks[i]} --lora_alpha $la --no_compute_similarity --scheduler cosine

#             done
#         done
#     done

# done




# ### 60M: num_training_steps=10000, warmup_steps=1000, model_config=configs/llama_60m.json
# ws_beta=0
# update_interval=100
# pruning_method="ri"
# remove_method="weight_magnitude_soft"
# sparsity_distribution="uniform"
# zeta=0.1
# iterative_warmup_steps=20

# model_config="configs/llama_60m.json"
# num_training_steps=10000
# warmup_steps=1000

# datasets=("openwebtext" "c4")



# lrs=(3e-3)
# las=(0)

# sparsities=(0.95 0.95 0.9 0.85 0.95 0.9 0.85 0.8 0.75)
# ranks=(16 48 32 16 80 64 48 32 16)
# las=(32)

# for dataset in "${datasets[@]}"
# do
#     for lr in "${lrs[@]}"
#     do
#         for la in "${las[@]}"
#         do
#             for ((i=0; i<${#sparsities[@]}; i++)); do
#                 # static
#                 HF_HOME="/data/hf_cache" HF_DATASETS_OFFLINE=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --standalone --nproc_per_node 8 torchrun_main.py --run_name "static_sp${sparsities[i]}+r${ranks[i]}+la$la" --model_config $model_config --dataset_name $dataset --lr $lr --batch_size 64 --total_batch_size 512 --num_training_steps $num_training_steps --warmup_steps $warmup_steps --weight_decay 0 --dtype bfloat16 --eval_every 1000 --optimizer adam --iterative_warmup_steps $iterative_warmup_steps --update_interval $update_interval --sparsity ${sparsities[i]} --only_save_last --dst_scheduler --static_dst --no_log --log_to_file --save_dir checkpoints/ --only_save_last --sltrain --rank ${ranks[i]} --lora_alpha $la --no_compute_similarity --scheduler cosine

#             done
#         done
#     done

# done

### 130M: num_training_steps=20000, warmup_steps=2000, model_config=configs/llama_130m.json
# Launches torchrun_main.py through torch.distributed.run (8 processes) once
# for every combination of dataset, learning rate, lora_alpha and
# (sparsity, rank) pair configured below.

# NOTE(review): these five settings are not passed to the launch command in
# this section. They are kept unchanged in case code outside this section
# reads them -- confirm before removing.
ws_beta=0
pruning_method="ri"
remove_method="weight_magnitude_soft"
sparsity_distribution="uniform"
zeta=0.1

# Forwarded as --update_interval / --iterative_warmup_steps.
update_interval=100
iterative_warmup_steps=20

model_config="configs/llama_130m.json"
num_training_steps=20000
warmup_steps=2000

datasets=("c4")
lrs=(3e-3)
# Values forwarded as --lora_alpha.
las=(16)

# sparsities[i] is always run together with ranks[i]; keep both arrays the
# same length and in matching order.
# sparsities=(0.95 0.95 0.9 0.85 0.95 0.9 0.85 0.8 0.75)
# ranks=(24 72 48 24 120 96 72 48 24)
sparsities=(0.75)
ranks=(24)

# A length mismatch would otherwise launch a run with an empty --rank value.
if (( ${#sparsities[@]} != ${#ranks[@]} )); then
    printf 'error: sparsities has %d entries but ranks has %d\n' \
        "${#sparsities[@]}" "${#ranks[@]}" >&2
    exit 1
fi

for dataset in "${datasets[@]}"; do
    for lr in "${lrs[@]}"; do
        for la in "${las[@]}"; do
            for ((i = 0; i < ${#sparsities[@]}; i++)); do
                sparsity=${sparsities[i]}
                rank=${ranks[i]}

                # static
                # Arguments are collected in an array so every value reaches
                # the launcher as exactly one word, whatever it contains.
                launch_args=(
                    --standalone
                    --nproc_per_node 8
                    torchrun_main.py
                    --run_name "static_sp${sparsity}+r${rank}+la${la}"
                    --model_config "$model_config"
                    --dataset_name "$dataset"
                    --lr "$lr"
                    --batch_size 64
                    --total_batch_size 512
                    --num_training_steps "$num_training_steps"
                    --warmup_steps "$warmup_steps"
                    --weight_decay 0
                    --dtype bfloat16
                    --eval_every 1000
                    --optimizer adam
                    --iterative_warmup_steps "$iterative_warmup_steps"
                    --update_interval "$update_interval"
                    --sparsity "$sparsity"
                    --only_save_last
                    --dst_scheduler
                    --static_dst
                    --no_log
                    --log_to_file
                    --save_dir checkpoints/
                    --sltrain
                    --rank "$rank"
                    --lora_alpha "$la"
                    --no_compute_similarity
                    --scheduler cosine
                )

                # The environment assignments apply to this one command only.
                HF_HOME="/data/hf_cache" HF_DATASETS_OFFLINE=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
                    python -m torch.distributed.run "${launch_args[@]}"
            done
        done
    done
done