#!/bin/bash
#SBATCH -p AI4Phys
#SBATCH --job-name=suzuki_all
#SBATCH --output=suzuki_all.output
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:2

# Activate the "llama" environment by sourcing the `activate` script found on
# PATH (presumably conda's -- confirm for this cluster).
# Abort the job if activation fails: otherwise the training command below
# would run with whatever interpreter and packages happen to be on PATH.
if ! source activate llama; then
  echo "error: could not activate environment 'llama'; aborting job" >&2
  exit 1
fi


# MASTER_ADDR=`scontrol show hostname $SLURM_JOB_NODELIST | head -n1`
# MASTER_PORT=$((RANDOM % 101 + 20000))

# export MASTER_ADDR=$MASTER_ADDR
# export MASTER_PORT=$MASTER_PORT
# echo $MASTER_ADDR
# echo $MASTER_PORT

# function makehostfile() {
# perl -e '$slots=split /,/, $ENV{"SLURM_STEP_GPUS"};
# $slots=8 if $slots==0; # workaround 8 gpu machines
# @nodes = split /\n/, qx[scontrol show hostnames $ENV{"SLURM_JOB_NODELIST"}];
# print map { "$b$_ slots=$slots\n" } @nodes'
# }
# makehostfile > hostfile
# hostfile=""


# --include='localhost' \
# Run yield_ft_ds.py through the DeepSpeed launcher inside a Slurm job step.
# The script options are collected in an array (one option/value pair per
# line, same order as passed on the command line) instead of a long
# backslash-continued command.
train_args=(
  --pretrained_model_path '/mnt/shared-storage-user/caipengxiang/H200-share/models/share/step1_llama3_8b_0916_yearly_pistachio_ep3'
  --lora_adapter_path '/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/saved_models/lora_adapter'
  --yield_predictor_path '/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/saved_models/llama_ep3_1115-18/predictor.pt'
  --num_epoch 100
  --lr 1e-4
  --data_path '/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/data4regression'
  --data_name 'suzuki_miyaura_fg_changes_all'
  --per_device_train_batch_size 4
  --save_root '/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/saved_models/suzuki_miyaura_fg_changes_all'
  --gradient_accumulation_steps 1
  --use_lora 1
  --log_file 'training_ds.log'
  --deepspeed_config 'yield_ft_ds_config.json'
  --mlp_lr_multiplier 1
  --save_interval 20
)

# Last command of the script, so its exit status becomes the job's exit status.
srun deepspeed --launcher SLURM yield_ft_ds.py "${train_args[@]}"
   

   # --load_ds_dir "/mnt/hwfile/ai4chem/chenjianpeng/train_regression_fg_info/llama_ep3_1115-18/checkpoints" \
   # --load_ds_ckpt_id 

