#!/bin/bash  
# Trial run: launches dp_finetune.py through the deepspeed launcher with only
# GPUs 0 and 1 visible, writing both outputs and logs to output_test.
#
# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, which helps
# pinpoint failing kernels at the cost of throughput.
#
# NOTE(review): the meaning of each flag is defined by dp_finetune.py, which is
# not visible from this script; confirm semantics there before changing values.
trial_args=(
  --deepspeed_config dp_ZeRO_config.json
  --output_dir output_test
  --model_name_or_path Llama-3.2-1B-Instruct
  --tokenizer_name Llama-3.2-1B-Instruct
  --do_train "yes"
  --save_steps 2
  --logging_dir output_test
  --logging_steps 1
  --seed 2025
  --dataloader_num_workers 4
  --train_data_file data/train.jsonl
  # Per-example clipping norm and (epsilon, delta) targets.
  --per_example_max_grad_norm 0.1
  --target_delta 1e-5
  --target_epsilon 5
  --learning_rate 2e-4
  --lr_decay "no"
  --num_train_epochs 1
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  # All parameter-subset / freezing switches are passed as "no".
  --attention_only "no"
  --bias_only "no"
  --static_lm_head "no"
  --static_embedding "no"
  --non_private "no"
  --save_at_last "yes"
)

CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1 \
  deepspeed dp_finetune.py "${trial_args[@]}"
  
  
  
  # Training run: launches dp_finetune.py through the deepspeed launcher,
  # writing outputs and logs to Llama-3.2-1B-Instruct-SFT-DP and reading
  # train.jsonl from the current directory.
  #
  # GPU selection is taken from the caller's CUDA_VISIBLE_DEVICES. The launch
  # happens in a subshell so the caller's environment is never modified.
  #
  # NOTE(review): the meaning of each flag is defined by dp_finetune.py, which
  # is not visible from this script; confirm semantics there.
  (
    # Forwarding an unset variable as CUDA_VISIBLE_DEVICES="" would hide every
    # GPU from CUDA. Drop the variable when it is empty so all devices stay
    # visible; otherwise export it so the caller's selection reaches the child
    # even if it was only a plain (non-exported) shell variable.
    if [[ -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
      unset CUDA_VISIBLE_DEVICES
    else
      export CUDA_VISIBLE_DEVICES
    fi

    sft_dp_args=(
      --deepspeed_config dp_ZeRO_config.json
      --output_dir Llama-3.2-1B-Instruct-SFT-DP
      --model_name_or_path Llama-3.2-1B-Instruct
      --tokenizer_name Llama-3.2-1B-Instruct
      --do_train "yes"
      --save_steps 300
      --logging_dir Llama-3.2-1B-Instruct-SFT-DP
      --logging_steps 50
      --seed 2025
      --dataloader_num_workers 4
      --train_data_file train.jsonl
      # Per-example clipping norm and (epsilon, delta) targets.
      --per_example_max_grad_norm 0.1
      --target_delta 1e-5
      --target_epsilon 5
      --learning_rate 2e-4
      --lr_decay "no"
      --num_train_epochs 1
      --per_device_train_batch_size 1
      --gradient_accumulation_steps 8
      # All parameter-subset / freezing switches are passed as "no".
      --attention_only "no"
      --bias_only "no"
      --static_lm_head "no"
      --static_embedding "no"
      --non_private "no"
      --save_at_last "yes"
    )

    # CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous, which
    # helps pinpoint failing kernels at the cost of throughput.
    CUDA_LAUNCH_BLOCKING=1 deepspeed dp_finetune.py "${sft_dp_args[@]}"
  )
  
# Training run with NCCL settings: launches dp_finetune.py through the
# deepspeed launcher for 2 epochs with gradient accumulation 64, writing
# outputs and logs to Llama-3.2-1B-Instruct-SFT-DP and reading train.jsonl
# from the current directory.
#
# GPU selection is taken from the caller's CUDA_VISIBLE_DEVICES. The launch
# happens in a subshell so the caller's environment is never modified.
#
# NOTE(review): this run uses the same --output_dir as the preceding
# invocation in this file; confirm that overwriting/mixing its checkpoints is
# intended, or run only one of the two.
# NOTE(review): the meaning of each flag is defined by dp_finetune.py, which is
# not visible from this script; confirm semantics there.
(
  # Forwarding an unset variable as CUDA_VISIBLE_DEVICES="" would hide every
  # GPU from CUDA. Drop the variable when it is empty so all devices stay
  # visible; otherwise export it so the caller's selection reaches the child
  # even if it was only a plain (non-exported) shell variable.
  if [[ -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    unset CUDA_VISIBLE_DEVICES
  else
    export CUDA_VISIBLE_DEVICES
  fi

  sft_dp_nccl_args=(
    --deepspeed_config dp_ZeRO_config.json
    --output_dir Llama-3.2-1B-Instruct-SFT-DP
    --model_name_or_path Llama-3.2-1B-Instruct
    --tokenizer_name Llama-3.2-1B-Instruct
    --do_train "yes"
    --save_steps 600
    --save_at_last "yes"
    --logging_dir Llama-3.2-1B-Instruct-SFT-DP
    --logging_steps 5
    --seed 2025
    --dataloader_num_workers 12
    --train_data_file train.jsonl
    # Per-example clipping norm and (epsilon, delta) targets.
    --per_example_max_grad_norm 0.1
    --target_delta 1e-7
    --target_epsilon 8
    --learning_rate 5e-5
    --lr_decay "no"
    --num_train_epochs 2
    --per_device_train_batch_size 1
    --gradient_accumulation_steps 64
    # All parameter-subset / freezing switches are passed as "no".
    --attention_only "no"
    --bias_only "no"
    --static_lm_head "no"
    --static_embedding "no"
    --non_private "no"
  )

  # CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous (debug aid).
  # NCCL_IB_DISABLE=1 turns off NCCL's InfiniBand transport.
  # NOTE(review): NCCL_ASYNC_ERROR_HANDLING is read by PyTorch's NCCL process
  # group rather than NCCL itself, and NCCL_TIMEOUT could not be confirmed as a
  # documented variable; both are passed through unchanged — verify they have
  # the intended effect with the installed PyTorch/NCCL versions.
  CUDA_LAUNCH_BLOCKING=1 \
    NCCL_ASYNC_ERROR_HANDLING=1 \
    NCCL_IB_DISABLE=1 \
    NCCL_TIMEOUT=600 \
    deepspeed dp_finetune.py "${sft_dp_nccl_args[@]}"
)
  
  
