# # 6.4 (0.005), 5.9 (0.01), 4.4 (0.1). But I think its generation is better, especially for dialog.
# pre_name='new_llama2-7b'; forward_type=7; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=0; rank=5; start_layer=2; end_layer=30; step=2
# visible_device=1; run_flag=1 
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_dt${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}"

# 5.7 for (0.01), 4.5 for (0.1), interesting
# pre_name='new_llama2-7b'; forward_type=8; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=0; rank=5; start_layer=2; end_layer=30; step=2
# visible_device=1; run_flag=1
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}/"

# (value not recorded) for (0.01), 4.4 for (0.1); the only difference from the previous run is rank=20
# pre_name='new_llama2-7b'; forward_type=8; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=0; rank=20; start_layer=2; end_layer=30; step=2
# visible_device=1; run_flag=1
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}/"

# # 4.9 (0.1) -> 4.4 (1.0)
# pre_name='new_llama2-7b'; forward_type=8; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=901; rank=20; start_layer=2; end_layer=30; step=2
# visible_device=1; run_flag=1
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}/"

# # # 3.8 (or 4.1) [4.3 when tested on 9/7/24], so a smaller learning rate is better! The output seems very good, which is quite strange.
# pre_name='new_llama2-7b'; forward_type=8; lr=0.0005; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=902; rank=20; start_layer=2; end_layer=30; step=2
# visible_device=0; run_flag=0
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}/"


# 7.8 omg, but the quality is not bad actually (0.01) # 5.7 for (0.1), bad hahaha
# pre_name='new_llama2-7b'; forward_type=7; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=901; rank=20; start_layer=2; end_layer=30; step=2
# visible_device=1; run_flag=1
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}/"

# 14.3, bad
# pre_name='new_llama2-7b'; forward_type=8; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.01; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=902; rank=20; start_layer=2; end_layer=30; step=2
# visible_device=1; run_flag=1
# echo $pre_name
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}/"

# rank=20, ft=1443
# 7.0 (0.1 & 0.01)  -> 6.4 (0.1 & 0.02) -> 4.77 (1.0 & 0.1) -> 4.9 (1 more epoch)
# pre_name='new_llama2-7b'; forward_type=1443; lr=0.0005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=1
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=915; rank=20; start_layer=2; end_layer=30; step=2
# if_subtrain=0; subtrain_type=0; subtrain_downsample_rate=0.1
# visible_device=0; run_flag=1
# echo $pre_name 
# python llama_test.py --forward_type $forward_type --lr 0.00025 --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}/"

# pre_name='new_llama2-7b'; forward_type=8; lr=0.001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.01; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=922; rank=20; start_layer=2; end_layer=30; step=2
# if_subtrain=1; subtrain_type=1; subtrain_downsample_rate=0.001
# visible_device=0; run_flag=1
# echo $pre_name 
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}/"

# # 6.6
# pre_name='new_llama2-7b'; forward_type=8; lr=0.0005; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.01; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=923; rank=20; start_layer=2; end_layer=30; step=2
# if_subtrain=1; subtrain_type=1; subtrain_downsample_rate=0.001
# visible_device=0; run_flag=1
# echo $pre_name 
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}/"

# # 4.5
# pre_name='new_llama2-7b'; forward_type=8; lr=0.0005; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=922; rank=20; start_layer=2; end_layer=30; step=2
# if_subtrain=1; subtrain_type=1; subtrain_downsample_rate=0.2
# visible_device=0; run_flag=1
# echo $pre_name 
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir "xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}/"

# # 3.0
# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.0001; n_epochs=2; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=alpaca-gpt4; test_type=alpaca-gpt4
# info_tuples_type=0; rank=400; start_layer=2; end_layer=30; step=2
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=100000; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=alpaca-gpt4; warmup_downsample_rate=0.1
# visible_device=4; run_flag=1
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir

# # not so good
# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=1024; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=970; rank=399; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=0; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=0; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model

# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=1024; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=973; rank=399; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=0; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=0; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model


# # # 100k-200k, lr=0.00002, max_tokens=896, grad_acc=2, +add eos_token, ppl=12.39
# # # # warmup ppl: 4.76 -> 3.74 (0.1) -> 3.18 (1.0) [better than original 3.3], 399 is for weight_decay=0.1, type=1. 
# # # New warmup! We can see that the diversity and content of the output are much better!
# # # And one epoch is enough. lr = 5e-5 is fine
# # 11.8
# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00002; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=1
# max_tokens=896; downsample_rate=1.0; dataset_name=fineweb; test_type=fineweb
# info_tuples_type=0; rank=399; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=0; warmup_dataset_type=2; warmup_random_size=61; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=0; run_flag=1
# if_store_whole_model=1
# original_skip_size=100000
# use_stored_dir="xxx/llama/new/current_best/"
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model --original_skip_size $original_skip_size --use_stored_dir $use_stored_dir

# warmup (7.22, not good...) -> 4.04 (0.1)
# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00002; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=1024; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=979; rank=399; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=0; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=0; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model

# # warmup (5.52, not good...) -> 3.75 (0.1) -> 3.28 (1.0)
# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00002; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=1024; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=977; rank=399; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=0; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=1; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model


# # warmup ppl: 4.76 -> 3.74 (0.1) -> 3.18 (1.0) [better than original 3.3], 399 is for weight_decay=0.1, type=1. 
# # New warmup! We can see that the diversity and content of the output are much better!
# # And one epoch is enough. lr = 5e-5 is fine
# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0; if_store_baseline=1
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=0; rank=399; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=100000; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=7; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model --if_store_baseline $if_store_baseline

# # warmup ppl: 4.76 -> 3.74 (0.1) -> 3.18 (1.0) [better than original 3.3], 399 is for weight_decay=0.1, type=1. 
# # New warmup! We can see that the diversity and content of the output are much better!
# # And one epoch is enough. lr = 5e-5 is fine

# mul
# # warmup ppl -> 4.73
# pre_name='warmup_llama2-7b'; forward_type=-1; lr=0.00002; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=0; rank=391; start_layer=2; end_layer=30; step=2 # NOTE(review): rank is 391 here, so the "399 is for test" remark does not match this run -- confirm
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=0; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=0; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model




# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.001; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=0; rank=5; start_layer=2; end_layer=30; step=2 # NOTE(review): rank is 5 here, so the "399 is for test" remark does not match this run -- confirm
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=0; warmup_dataset_type=2; warmup_random_size=112; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=2; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model

# pre_name='online_warmup_bf_llama2-7b'; forward_type=8; lr=0.0005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=902; rank=32; start_layer=2; end_layer=30; step=2
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_online=1
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=11197; warmup_batch_size=8; warmup_n_epochs=2; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=1.0; warmup_text_batch_size=4; subset_num=5
# visible_device=0; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model \
#     --if_online $if_online --warmup_downsample_rate $warmup_downsample_rate --warmup_dataset_names $warmup_dataset_names --warmup_text_batch_size $warmup_text_batch_size --subset_num $subset_num

# pre_name='online_warmup_bf_llama2-7b'; forward_type=8; lr=0.0005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=0.1; dataset_name=arxiv-math; test_type=arxiv-math
# info_tuples_type=902; rank=20; start_layer=2; end_layer=30; step=2
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_online=1
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=11211; warmup_batch_size=8; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1; warmup_text_batch_size=4; subset_num=1
# visible_device=0; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model \
#     --if_online $if_online --warmup_downsample_rate $warmup_downsample_rate --warmup_dataset_names $warmup_dataset_names --warmup_text_batch_size $warmup_text_batch_size --subset_num $subset_num

# 3.62
# pre_name='ano_warmup_llama2-7b'; forward_type=8; lr=0.00005; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=0
# max_tokens=512; downsample_rate=1.0; dataset_name=arxiv-math::alpaca-gpt4::dialogsum::databricks-dolly-15k; test_type=arxiv-math
# info_tuples_type=980; rank=400; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=1; warmup_dataset_type=2; warmup_random_size=100000; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=7; run_flag=1
# if_store_whole_model=1
# use_stored_dir=""
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# # warmup_model_dir="xxx/llama/new/ft_init_copy/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model


# pre_name='warmup_llama2-7b'; forward_type=8; lr=0.00002; n_epochs=1; batch_size=1; batch_size_val=1; use_stored=1
# max_tokens=512; downsample_rate=1.0; dataset_name=fineweb-edu; test_type=fineweb-edu
# info_tuples_type=981; rank=400; start_layer=2; end_layer=30; step=2 # 399 is for test
# if_subtrain=0; subtrain_type=1; subtrain_downsample_rate=0.001
# if_warmup=0; warmup_dataset_type=2; warmup_random_size=72727270; warmup_batch_size=1000; warmup_n_epochs=5; warmup_lr=0.001
# warmup_dataset_names=arxiv-math; warmup_downsample_rate=0.1
# visible_device=7; run_flag=1
# if_store_whole_model=1
# original_skip_size=0
# use_stored_dir="xxx/llama/new/ano_warmup_llama2-7b_ft8_lr0.00005_bs1_dsr1.0_ml_arxiv-math::alpaca-gpt4::dialogsum::databricks-dolly-15k_itt981_r400_s2_e30_2_stt1_stdsr0.001_wt2_wrs100000_wn5_wlr0.001/"
# store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
# warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"
# store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
# echo $store_model_dir 
# CUDA_LAUNCH_BLOCKING=1 python llama_test.py --forward_type $forward_type --lr $lr --n_epochs $n_epochs --batch_size $batch_size --batch_size_val $batch_size_val --use_stored $use_stored \
#     --downsample_rate $downsample_rate --max_tokens $max_tokens --dataset_name $dataset_name --test_type $test_type \
#     --info_tuples_type $info_tuples_type --rank $rank --start_layer $start_layer --end_layer $end_layer --step $step \
#     --if_subtrain $if_subtrain --subtrain_type $subtrain_type --subtrain_downsample_rate $subtrain_downsample_rate \
#     --if_warmup $if_warmup --dataset_type $warmup_dataset_type --random_size $warmup_random_size --warmup_batch_size $warmup_batch_size --warmup_n_epochs $warmup_n_epochs --warmup_lr $warmup_lr --store_data_path $store_data_path --warmup_model_dir $warmup_model_dir \
#     --visible_device $visible_device --run_flag $run_flag --store_model_dir $store_model_dir --if_store_whole_model $if_store_whole_model --original_skip_size $original_skip_size --use_stored_dir $use_stored_dir

# 13.780180931091309
# Live run: launch llama_test.py on fineweb-edu, loading a previously stored
# model from ${use_stored_dir}. Every variable below is forwarded verbatim as
# a CLI flag to llama_test.py (see train_args at the bottom); their meaning is
# defined by that script, not here.

# --- core training settings ---
pre_name='warmup_llama2-7b'   # only used as the prefix of store_model_dir
forward_type=8
lr=0.00002
n_epochs=1
batch_size=1
batch_size_val=1
use_stored=1

# --- data settings ---
max_tokens=1024
downsample_rate=1.0
dataset_name=fineweb-edu
test_type=fineweb-edu

# --- info-tuple / layer-range settings (original note on this group: "399 is for test") ---
info_tuples_type=981
rank=400
start_layer=2
end_layer=30
step=2

# --- sub-training settings ---
if_subtrain=0
subtrain_type=1
subtrain_downsample_rate=0.001

# --- warmup settings (still forwarded even though if_warmup=0) ---
if_warmup=0
warmup_dataset_type=2         # passed as --dataset_type
warmup_random_size=72727270   # passed as --random_size
warmup_batch_size=1000
warmup_n_epochs=5
warmup_lr=0.001
warmup_dataset_names=arxiv-math   # only used to build the paths below
warmup_downsample_rate=0.1        # only used to build the paths below

# --- runtime settings ---
visible_device=2
run_flag=1
if_store_whole_model=1
original_skip_size=100000

# --- paths ---
use_stored_dir="xxx/llama/new/ano_warmup_llama2-7b_ft8_lr0.00005_bs1_dsr1.0_ml_arxiv-math::alpaca-gpt4::dialogsum::databricks-dolly-15k_itt980_r400_s2_e30_2_stt1_stdsr0.001_wt2_wrs100000_wn5_wlr0.001/"
store_data_path="xxx/llama_reader_larger/${warmup_dataset_names}/ratio_${warmup_downsample_rate}/"
warmup_model_dir="xxx/llama/new/only_warmup_ft${forward_type}_r${rank}_wt${warmup_dataset_type}_wn${warmup_n_epochs}_wlr${warmup_lr}_${warmup_dataset_names}_${warmup_downsample_rate}"

# The "ml" field used to interpolate ${max_len}, which this block never
# assigns, so the field came out empty ("..._ml_...") and runs that differ
# only in max_tokens resolved to the same output directory. Honour max_len if
# something earlier in the file set it; otherwise fall back to max_tokens.
store_model_dir="xxx/llama/new/${pre_name}_ft${forward_type}_lr${lr}_bs${batch_size}_dsr${downsample_rate}_ml${max_len:-${max_tokens}}_${dataset_name}_itt${info_tuples_type}_r${rank}_s${start_layer}_e${end_layer}_${step}_stt${subtrain_type}_stdsr${subtrain_downsample_rate}_wt${warmup_dataset_type}_wrs${warmup_random_size}_wn${warmup_n_epochs}_wlr${warmup_lr}"
printf '%s\n' "$store_model_dir"

# Build the argument list as an array so each value stays exactly one argv
# word: an empty or whitespace-containing value can no longer drop or split
# a flag's argument.
train_args=(
  --forward_type "$forward_type"
  --lr "$lr"
  --n_epochs "$n_epochs"
  --batch_size "$batch_size"
  --batch_size_val "$batch_size_val"
  --use_stored "$use_stored"
  --downsample_rate "$downsample_rate"
  --max_tokens "$max_tokens"
  --dataset_name "$dataset_name"
  --test_type "$test_type"
  --info_tuples_type "$info_tuples_type"
  --rank "$rank"
  --start_layer "$start_layer"
  --end_layer "$end_layer"
  --step "$step"
  --if_subtrain "$if_subtrain"
  --subtrain_type "$subtrain_type"
  --subtrain_downsample_rate "$subtrain_downsample_rate"
  --if_warmup "$if_warmup"
  --dataset_type "$warmup_dataset_type"
  --random_size "$warmup_random_size"
  --warmup_batch_size "$warmup_batch_size"
  --warmup_n_epochs "$warmup_n_epochs"
  --warmup_lr "$warmup_lr"
  --store_data_path "$store_data_path"
  --warmup_model_dir "$warmup_model_dir"
  --visible_device "$visible_device"
  --run_flag "$run_flag"
  --store_model_dir "$store_model_dir"
  --if_store_whole_model "$if_store_whole_model"
  --original_skip_size "$original_skip_size"
  --use_stored_dir "$use_stored_dir"
)

CUDA_LAUNCH_BLOCKING=1 python llama_test.py "${train_args[@]}"

