
# ---------------------------------------------------------------------------
# Run configuration.
# The hyperparameters below are passed to the training command and are also
# encoded into the directory layout so that each configuration gets its own
# checkpoint / log / model / dispersion directories.
# ---------------------------------------------------------------------------
loss_type="rmj_dpo"
learning_rate=3e-7
beta="0.03"
epochs="1"
# Within this script, warmup is only used as a directory label.
warmup="warmup02"

# Root directories for each kind of artifact.
BASE_OUTPUT=/checkpoints
BASE_LOG=/logs
BASE_TRAINED=/models
BASE_REF=/dispersion_values

# Path fragments derived from the hyperparameters. Every "." in beta is
# replaced with "_" (0.03 -> 0_03) before it is used in a directory name.
safe_beta="${beta//./_}"
beta_dir="beta${safe_beta}"
epoch_dir="epoch${epochs}"
learning_rate_dir="lr${learning_rate}"

# Per-run suffix shared by all four artifact trees:
#   <loss_type>/lr<lr>/beta<beta>/epoch<epochs>/<warmup>
run_subdir="${loss_type}/${learning_rate_dir}/${beta_dir}/${epoch_dir}/${warmup}"

OUTPUT_DIR="${BASE_OUTPUT}/${run_subdir}"
LOGGING_DIR="${BASE_LOG}/${run_subdir}"
TRAINED_DIR="${BASE_TRAINED}/${run_subdir}"
REF_DISP_DIR="${BASE_REF}/${run_subdir}"




# ---------------------------------------------------------------------------
# Launch DPO.py via `accelerate launch` with 7 processes, using
# accelerate_deepspeed_config.yaml (path is relative to the current directory).
# stdout and stderr stay visible on the terminal and are also appended to
# ${LOGGING_DIR}/stdout.log and ${LOGGING_DIR}/stderr.log respectively.
#
# Required environment:
#   MASTER_PORT - value passed to --main_process_port.
# ---------------------------------------------------------------------------

# Fail fast with a clear message: an unset or empty MASTER_PORT would vanish
# from the command line and leave --main_process_port without an argument,
# causing the following flag to be misparsed.
: "${MASTER_PORT:?MASTER_PORT must be set (port for --main_process_port)}"

# tee cannot create missing parent directories, so make sure the log
# directory exists before the process substitutions below open their files.
mkdir -p -- "${LOGGING_DIR}" || {
  printf 'error: cannot create log directory: %s\n' "${LOGGING_DIR}" >&2
  exit 1
}

accelerate launch --main_process_port "${MASTER_PORT}" --num_processes 7 \
  --config_file accelerate_deepspeed_config.yaml DPO.py \
  --model_name_or_path /models/Meta-Llama-3-8B-Instruct \
  --data_path /data/llama3-ultrafeedback-SkyworkV2_pair_responses \
  --loss_type "${loss_type}" \
  --per_device_train_batch_size 2 \
  --gradient_accumulation_steps 8 \
  --num_train_epochs "${epochs}" \
  --max_length 4096 \
  --max_prompt_length 1024 \
  --max_completion_length 1500 \
  --save_steps 1000 \
  --logging_steps 5 \
  --beta "${beta}" \
  --learning_rate "${learning_rate}" \
  --output_dir "${OUTPUT_DIR}" \
  --logging_dir "${LOGGING_DIR}" \
  --dpo_finetuned_model_saved_dir "${TRAINED_DIR}" \
  --output_reference_dispersion_local_dir "${REF_DISP_DIR}" \
  > >(tee -a "${LOGGING_DIR}/stdout.log") \
  2> >(tee -a "${LOGGING_DIR}/stderr.log" >&2)

