# Install the pinned DeepSpeed release (0.14.0) before training is launched.
# NOTE(review): the exit status is not checked, so the script keeps going even
# if the installation fails - confirm that this best-effort behavior is intended.
pip3 install deepspeed==0.14.0

# Distributed-training rendezvous settings, derived from the ARNOLD_* variables
# (presumably injected by the cluster scheduler - confirm) and exported so that
# child processes started by this script can read them.
MASTER_ADDR=${ARNOLD_WORKER_0_HOST}
# ARNOLD_WORKER_0_PORT may hold several comma-separated ports; only the first
# one is used. It is kept as a plain scalar on purpose: Bash cannot export
# array variables, so an array-valued MASTER_PORT would silently be missing
# from the environment of child processes.
MASTER_PORT=${ARNOLD_WORKER_0_PORT%%,*}
NPROC_PER_NODE=${ARNOLD_WORKER_GPU}
NNODES=${ARNOLD_WORKER_NUM}
NODE_RANK=${ARNOLD_ID}
export MASTER_ADDR MASTER_PORT NPROC_PER_NODE NNODES NODE_RANK

# Switch to the fastchat directory; the launcher below is invoked by a relative
# name, so abort instead of running it from the wrong directory if cd fails
# (cd itself prints the reason to stderr).
cd /.../.../.../fastchat/ || exit 1

# torchrun command prefix, assembled one flag at a time from the rendezvous
# settings and exported for child processes. The value intentionally ends with
# a single space.
RUN_CMD="torchrun"
RUN_CMD="${RUN_CMD} --master_port=${MASTER_PORT}"
RUN_CMD="${RUN_CMD} --master_addr=${MASTER_ADDR}"
RUN_CMD="${RUN_CMD} --nproc_per_node=${NPROC_PER_NODE}"
RUN_CMD="${RUN_CMD} --nnodes=${NNODES}"
RUN_CMD="${RUN_CMD} --node_rank=${NODE_RANK} "
export RUN_CMD


# Extra training flags, assembled one flag at a time and exported for child
# processes. The HDFS save path is suffixed with the node count and this node's
# rank. The value intentionally ends with two spaces.
OTHER="--my_task_name=starcoder2-3b-code-dpo_torchrun"
OTHER="${OTHER} --ddp_find_unused_parameters=False"
OTHER="${OTHER} --save_hdfs_path=....../home/.../.../user/.../metadata/generate/codedpo/starcoder2-3b/starcoder2-3b-code-dpo-multinode_${NNODES}_${NODE_RANK}"
OTHER="${OTHER} --num_train_epochs=10"
OTHER="${OTHER} --save_on_each_node=False"
OTHER="${OTHER} --deepspeed=/.../.../.../fastchat/accelerate_config/ds_config_zero3.json  "
export OTHER

# Run the DPO training launcher in a new bash process. It is referenced by a
# relative name, so it is resolved against the current working directory, and
# it inherits the variables exported above (RUN_CMD, OTHER, ...).
# NOTE(review): presumably the launcher combines RUN_CMD and OTHER into the
# final training command - confirm against that script.
bash dpo.code.a100.starcoder2-3b.sh
