#! /bin/sh

# ============================================ debug model =======================================================

# clear; rm -rf outputs/; python ./test_runner.py --ngpu 1 outputs --test debug_model --print_info True --offloadingbw -1

# =========================================== 8B =================================================================

# Identify critical tensors
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_128_2k_4ubs --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_1k_8ubs  --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_2k_4ubs  --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_3k_2ubs  --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_4k_2ubs  --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_16_2k_4ubs  --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_32_2k_4ubs  --print_info True --offloadingbw 0
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 70b_16_2k_1ubs  --print_info True --offloadingbw 0

# Trace gen
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_128_2k_4ubs --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_128_2k_4ubs
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_1k_8ubs  --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_64_1k_8ubs
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_2k_4ubs  --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_64_2k_4ubs
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_3k_2ubs  --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_64_3k_2ubs
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_64_4k_2ubs  --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_64_4k_2ubs
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_16_2k_4ubs  --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_16_2k_4ubs
# clear; rm -rf outputs/; python ./test_runner.py --ngpu 2 outputs --test 8b_32_2k_4ubs  --print_info False --offloadingbw 0 --liveness_path ../teraio-algorithm/data_8b_32_2k_4ubs


# Offloading ideal case
# Run every 8B configuration below with --offloadingbw 0, pointing
# --plan_path at that configuration's torchtitan_X_<config>_gpu80 plan.
# Each iteration clears the terminal and removes outputs/ before launching,
# so runs execute one after another from a clean output directory.
for cfg in \
  8b_128_2k_4ubs \
  8b_64_1k_8ubs \
  8b_64_2k_4ubs \
  8b_64_4k_2ubs \
  8b_16_2k_4ubs \
  8b_32_2k_4ubs
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info False --offloadingbw 0 \
    --plan_path "../teraio-algorithm/results/torchtitan_X_${cfg}_gpu80"
done

# Offloading real case
# Same configurations and gpu80 plans as the ideal-case runs, but launched
# with --offloadingbw -1. The terminal is cleared and outputs/ is deleted
# before each sequential run.
for cfg in \
  8b_128_2k_4ubs \
  8b_64_1k_8ubs \
  8b_64_2k_4ubs \
  8b_64_4k_2ubs \
  8b_16_2k_4ubs \
  8b_32_2k_4ubs
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info False --offloadingbw -1 \
    --plan_path "../teraio-algorithm/results/torchtitan_X_${cfg}_gpu80"
done


# Mixed
# Run every 8B configuration with --offloadingbw -1 against its
# torchtitan_X_<config>_gpu80_cpu85 plan, saving stdout and stderr of each
# run to logs/<config>_gpu80_cpu85.trace.
#
# The redirection is written `> file 2>&1` instead of `&> file` because this
# script declares /bin/sh: a POSIX shell such as dash parses `cmd &> file` as
# "start cmd in the background, then truncate file", which would launch all
# runs concurrently and leave every trace empty.

# The redirection (and therefore the run itself) fails if logs/ is missing.
mkdir -p logs

for cfg in \
  8b_128_2k_4ubs \
  8b_64_1k_8ubs \
  8b_64_2k_4ubs \
  8b_64_4k_2ubs \
  8b_16_2k_4ubs \
  8b_32_2k_4ubs
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info False --offloadingbw -1 \
    --plan_path "../teraio-algorithm/results/torchtitan_X_${cfg}_gpu80_cpu85" \
    > "logs/${cfg}_gpu80_cpu85.trace" 2>&1
done



# =========================== 70B =================================

# Launch the two 70B configurations with --print_info True and
# --offloadingbw 0; no plan or liveness path is supplied. Each run is
# preceded by a terminal clear and removal of outputs/.
for cfg in 70b_8_2k_1ubs 70b_16_2k_1ubs; do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info True --offloadingbw 0
done

# Gen traces
# For each 70B configuration, run with --offloadingbw 0 and pass
# ../teraio-algorithm/data_<config> as --liveness_path. The terminal is
# cleared and outputs/ removed ahead of every run.
for cfg in \
  70b_8_2k_1ubs \
  70b_16_2k_1ubs \
  70b_16_1k_1ubs \
  70b_32_2k_1ubs \
  70b_16_3k_1ubs \
  70b_16_4k_1ubs
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info False --offloadingbw 0 \
    --liveness_path "../teraio-algorithm/data_${cfg}"
done

# Offloading ideal case
# Run every 70B configuration with --offloadingbw 0 against its
# torchtitan_X_<config>_gpu78 plan, saving stdout and stderr of each run to
# logs/<config>_ideal.trace.
#
# `> file 2>&1` is used instead of `&> file`: under the /bin/sh shebang a
# POSIX shell such as dash treats `cmd &> file` as "background cmd, then
# truncate file", so the runs would overlap and the traces would stay empty.

# Without logs/ the redirection fails and the run is never started.
mkdir -p logs

for cfg in \
  70b_8_2k_1ubs \
  70b_16_2k_1ubs \
  70b_32_2k_1ubs \
  70b_16_1k_1ubs \
  70b_16_4k_1ubs \
  70b_16_3k_1ubs
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info False --offloadingbw 0 \
    --plan_path "../teraio-algorithm/results/torchtitan_X_${cfg}_gpu78" \
    > "logs/${cfg}_ideal.trace" 2>&1
done

# Offloading real case
# Run the 70B configurations with --offloadingbw -1, saving stdout and stderr
# of each run under logs/.
#
# `> file 2>&1` replaces `&> file`: under the /bin/sh shebang a POSIX shell
# such as dash treats `cmd &> file` as "background cmd, then truncate file",
# so the runs would overlap and the traces would stay empty.

# Without logs/ the redirection fails and the run is never started.
mkdir -p logs

# 70b_8_2k_1ubs is the only configuration that uses a *_gpu78_cpu854 plan and
# a *_cpu854.trace log name.
# NOTE(review): confirm this difference from the other runs is intentional.
clear
rm -rf outputs/
python ./test_runner.py --ngpu 2 outputs --test 70b_8_2k_1ubs \
  --print_info False --offloadingbw -1 \
  --plan_path ../teraio-algorithm/results/torchtitan_X_70b_8_2k_1ubs_gpu78_cpu854 \
  > logs/70b_8_2k_1ubs_cpu854.trace 2>&1

# The remaining configurations use their torchtitan_X_<config>_gpu78 plan and
# log to logs/<config>_gpu78.trace.
for cfg in \
  70b_16_2k_1ubs \
  70b_32_2k_1ubs \
  70b_16_1k_1ubs \
  70b_16_3k_1ubs \
  70b_16_4k_1ubs
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info False --offloadingbw -1 \
    --plan_path "../teraio-algorithm/results/torchtitan_X_${cfg}_gpu78" \
    > "logs/${cfg}_gpu78.trace" 2>&1
done

# set auto-solib-add on
# clear; rm -rf outputs/; gdb --args python ./test_runner.py --ngpu 1 outputs



# =========================== Granite-8B-128k =================================

# Trace gen
# For each Granite 8B configuration, run with --print_info True and
# --offloadingbw 0, passing ../teraio-algorithm/data_<config> as
# --liveness_path. Every run starts after clearing the terminal and
# removing outputs/.
for cfg in \
  granite_8b_bs16_seq4k_ubs4 \
  granite_8b_bs32_seq4k_ubs4 \
  granite_8b_bs64_seq4k_ubs4 \
  granite_8b_bs64_seq8k_ubs2 \
  granite_8b_bs64_seq2k_ubs8 \
  granite_8b_bs128_seq4k_ubs4
do
  clear
  rm -rf outputs/
  python ./test_runner.py --ngpu 2 outputs --test "$cfg" \
    --print_info True --offloadingbw 0 \
    --liveness_path "../teraio-algorithm/data_${cfg}"
done


