#!/bin/bash

# # gpt2 sst2
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train.py \
#           --tsqp true \
#           --model_name_or_path ./models/gpt2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --data_path ./data/sst2 \
#           --num_labels 2 \
#           --train_micro_batch_size_per_gpu 64 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/gpt2/sst2/ \
#           --finetune_method full-tuning \
#           --lora_target_modules c_attn,c_proj,c_fc \
#           --ds_config_path ./config/ds_config.json \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # gpt2 mnli
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train.py \
#           --tsqp true \
#           --model_name_or_path ./models/gpt2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --data_path ./data/mnli \
#           --num_labels 3 \
#           --train_micro_batch_size_per_gpu 64 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/gpt2/mnli/ \
#           --finetune_method full-tuning \
#           --lora_target_modules c_attn,c_proj,c_fc \
#           --ds_config_path ./config/ds_config.json \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # gpt2 qnli
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train.py \
#           --tsqp true \
#           --model_name_or_path ./models/gpt2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --data_path ./data/qnli \
#           --num_labels 2 \
#           --train_micro_batch_size_per_gpu 64 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 2e-5 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/gpt2/qnli/ \
#           --finetune_method full-tuning \
#           --lora_target_modules c_attn,c_proj,c_fc \
#           --ds_config_path ./config/ds_config.json \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # gpt2 qqp
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train.py \
#           --tsqp true \
#           --model_name_or_path ./models/gpt2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --data_path ./data/qqp \
#           --num_labels 2 \
#           --train_micro_batch_size_per_gpu 64 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-5 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/gpt2/qqp/ \
#           --finetune_method full-tuning \
#           --lora_target_modules c_attn,c_proj,c_fc \
#           --ds_config_path ./config/ds_config.json \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # qwen3 goemotions
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_goemotions.py \
#           --model_name_or_path ./models/qwen3-4b \
#           --data_path ./data/goemotions \
#           --tsqp true \
#           --src_len 64 \
#           --train_micro_batch_size_per_gpu 8 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 1 \
#           --output_dir ./outputs/tsqp/qwen3/goemotions/ \
#           --finetune_method lora \
#           --lora_target_modules q_proj,k_proj,v_proj,o_proj \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # gemma3 goemotions
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_goemotions.py \
#           --model_name_or_path ./models/gemma3-1b-it \
#           --data_path ./data/goemotions \
#           --tsqp true \
#           --src_len 64 \
#           --train_micro_batch_size_per_gpu 16 \
#           --gradient_accumulation_steps 1 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 1 \
#           --output_dir ./outputs/tsqp/gemma3/goemotions/ \
#           --finetune_method lora \
#           --lora_target_modules q_proj,k_proj,v_proj,o_proj \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# llama3.2 goemotions
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_goemotions.py \
#           --model_name_or_path ./models/Llama-3.2-3B-Instruct \
#           --data_path ./data/goemotions \
#           --tsqp true \
#           --src_len 64 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 8 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 1 \
#           --output_dir ./outputs/tsqp/llama3.2/goemotions/ \
#           --finetune_method lora \
#           --lora_target_modules q_proj,k_proj,v_proj,o_proj \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # qwen3 wic
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_wic.py \
#           --model_name_or_path ./models/qwen3-4b \
#           --data_path ./data/wic \
#           --tsqp true \
#           --num_labels 2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 4 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 5e-5 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 4 \
#           --output_dir ./outputs/tsqp/qwen3/wic/ \
#           --finetune_method lora \
#           --lora_target_modules q_proj,k_proj,v_proj,o_proj \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # llama3.2 wic
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_wic.py \
#           --model_name_or_path ./models/Llama-3.2-3B-Instruct \
#           --data_path ./data/wic \
#           --tsqp true \
#           --num_labels 2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 4 \
#           --gradient_accumulation_steps 2 \
#           --max_lr 1e-4 \
#           --initial_lr 5e-5 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 4 \
#           --output_dir ./outputs/tsqp/llama3.2/wic/ \
#           --finetune_method lora \
#           --lora_target_modules q_proj,k_proj,v_proj,o_proj \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # gemma3 wic
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_wic.py \
#           --model_name_or_path ./models/gemma3-1b-it \
#           --data_path ./data/wic \
#           --tsqp true \
#           --num_labels 2 \
#           --src_len 512 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 8 \
#           --gradient_accumulation_steps 1 \
#           --max_lr 1e-4 \
#           --initial_lr 5e-5 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 4 \
#           --output_dir ./outputs/tsqp/gemma3/wic/ \
#           --finetune_method lora \
#           --lora_target_modules q_proj,k_proj,v_proj,o_proj \
#           --lora_alpha 32 \
#           --lora_dropout 0.05 \
#           --lora_r 16 \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # qwen3 pubmedqa
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_pubmedqa.py \
#           --model_name_or_path ./models/qwen3-4b \
#           --data_path ./data/pubmedqa/split/ \
#           --tsqp true \
#           --src_len 2048 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 1 \
#           --gradient_accumulation_steps 16 \
#           --stage 3 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/qwen3/pubmedqa/ \
#           --finetune_method full-tuning \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # pubmedqa llama3.2
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_pubmedqa.py \
#           --model_name_or_path models/Llama-3.2-3B-Instruct \
#           --data_path ./data/pubmedqa/split/ \
#           --tsqp true \
#           --src_len 2048 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 1 \
#           --gradient_accumulation_steps 16 \
#           --stage 3 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/llama3.2/pubmedqa/ \
#           --finetune_method full-tuning \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # pubmedqa gemma3
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_pubmedqa.py \
#           --model_name_or_path models/gemma3-1b-it \
#           --data_path ./data/pubmedqa/split/ \
#           --tsqp true \
#           --src_len 2048 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 1 \
#           --gradient_accumulation_steps 16 \
#           --stage 3 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/gemma3/pubmedqa/ \
#           --finetune_method full-tuning \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# # finqa qwen3
# deepspeed --include localhost:0,1,2,3 --master_port 12345 ./training/train_finqa.py \
#           --model_name_or_path ./models/qwen3-4b \
#           --data_path ./data/finqa \
#           --tsqp true \
#           --src_len 2048 \
#           --tgt_len 128 \
#           --train_micro_batch_size_per_gpu 1 \
#           --gradient_accumulation_steps 16 \
#           --stage 3 \
#           --max_lr 1e-4 \
#           --initial_lr 1e-6 \
#           --min_lr 1e-8 \
#           --weight_decay 0.01 \
#           --adam_beta1 0.9 \
#           --adam_beta2 0.999\
#           --epochs 3 \
#           --output_dir ./outputs/tsqp/qwen3/finqa/ \
#           --finetune_method full-tuning \
#           --ds_config_path ./config/ds_config.json \
#           --offload_device cpu \
#           --nvme_path ./mnt/nvme

# finqa llama3.2
# Starts ./training/train_finqa.py through the DeepSpeed launcher, restricted
# to GPU slots 0-3 on localhost, with 12345 as the distributed master port.
# Everything after the script path is handed to train_finqa.py unchanged; the
# meaning of those flags is defined by that script, not by this file.
# NOTE(review): --stage 3 is presumably the ZeRO stage, and --offload_device /
# --nvme_path presumably configure offloading -- confirm in train_finqa.py.
model_path=models/Llama-3.2-3B-Instruct
data_path=./data/finqa
output_dir=./outputs/tsqp/llama3.2/finqa/

deepspeed \
    --include localhost:0,1,2,3 \
    --master_port 12345 \
    ./training/train_finqa.py \
    --model_name_or_path "$model_path" \
    --data_path "$data_path" \
    --tsqp true \
    --src_len 2048 --tgt_len 128 \
    --train_micro_batch_size_per_gpu 1 --gradient_accumulation_steps 16 \
    --stage 3 \
    --max_lr 1e-4 --initial_lr 1e-6 --min_lr 1e-8 \
    --weight_decay 0.01 --adam_beta1 0.9 --adam_beta2 0.999 \
    --epochs 3 \
    --output_dir "$output_dir" \
    --finetune_method full-tuning \
    --ds_config_path ./config/ds_config.json \
    --offload_device cpu --nvme_path ./mnt/nvme

# finqa gemma3
# Starts ./training/train_finqa.py through the DeepSpeed launcher, restricted
# to GPU slots 0-3 on localhost, with 12345 as the distributed master port.
# Everything after the script path is handed to train_finqa.py unchanged; the
# meaning of those flags is defined by that script, not by this file.
# NOTE(review): --stage 3 is presumably the ZeRO stage, and --offload_device /
# --nvme_path presumably configure offloading -- confirm in train_finqa.py.
model_path=models/gemma3-1b-it
data_path=./data/finqa/
output_dir=./outputs/tsqp/gemma3/finqa/

deepspeed \
    --include localhost:0,1,2,3 \
    --master_port 12345 \
    ./training/train_finqa.py \
    --model_name_or_path "$model_path" \
    --data_path "$data_path" \
    --tsqp true \
    --src_len 2048 --tgt_len 128 \
    --train_micro_batch_size_per_gpu 1 --gradient_accumulation_steps 16 \
    --stage 3 \
    --max_lr 1e-4 --initial_lr 1e-6 --min_lr 1e-8 \
    --weight_decay 0.01 --adam_beta1 0.9 --adam_beta2 0.999 \
    --epochs 3 \
    --output_dir "$output_dir" \
    --finetune_method full-tuning \
    --ds_config_path ./config/ds_config.json \
    --offload_device cpu --nvme_path ./mnt/nvme

