#!/bin/bash

# Positional arguments. Each one is exported, so child processes started by
# this script can read it from their environment as well.
#   $1  model_name  - forwarded later in this script as --model_name
#   $2  num_nodes   - forwarded later in this script to deepspeed --num_nodes
#   $3  num_gpus    - forwarded later in this script to deepspeed --num_gpus
#   $4  trial       - suffix of the hostfile name (hostfile_<trial>)
model_name=$1
num_nodes=$2
num_gpus=$3
trial=$4
export model_name num_nodes num_gpus trial

# Exported for child processes. TRANSFORMERS_CACHE is the variable name
# Hugging Face Transformers reads for its cache directory; the path is
# site-specific.
TRANSFORMERS_CACHE="/data/private_models/xx_models/huggingface"
export TRANSFORMERS_CACHE

# Load the devtoolset-7 environment into this shell.
# NOTE(review): presumably needed for a newer compiler toolchain -- confirm.
. /opt/rh/devtoolset-7/enable

#######################################
# Print a hostfile for the current SLURM job: one "<host> slots=<n>" line
# per node, where <n> is the number of lines `nvidia-smi --list-gpus`
# prints on that node (queried over ssh).
# Globals:   SLURM_JOB_NODELIST (read) - node list of the job
# Arguments: none
# Outputs:   hostfile lines on stdout; diagnostics on stderr
# Returns:   0 on success; non-zero if SLURM_JOB_NODELIST is unset/empty or
#            `scontrol show hostnames` fails
#######################################
makehostfile() {
    local nodes node gpus

    if [[ -z "${SLURM_JOB_NODELIST:-}" ]]; then
        printf 'makehostfile: SLURM_JOB_NODELIST is not set\n' >&2
        return 1
    fi

    # The node list must be quoted: compact lists such as "node[1-4]" are
    # valid glob patterns and would otherwise be rewritten by the shell
    # whenever a matching file exists in the working directory.
    nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") || return

    while IFS= read -r node; do
        [[ -n "$node" ]] || continue

        # stdin is redirected so ssh cannot swallow the loop's input.
        gpus=$(ssh "$node" nvidia-smi --list-gpus </dev/null | wc -l)
        # Some wc implementations pad the count with blanks.
        gpus=${gpus//[[:space:]]/}

        if [[ "$gpus" == 0 ]]; then
            # The line is still emitted (as before); this only makes an
            # unreachable node or a node without GPUs visible in the logs.
            printf 'makehostfile: no GPUs reported by %s\n' "$node" >&2
        fi
        printf '%s slots=%s\n' "$node" "$gpus"
    done <<< "$nodes"
}
# Write the hostfile for this trial into the current directory.
makehostfile > "hostfile_${trial}"

# Launch sft_merged_chatbot.py through the deepspeed launcher using the
# hostfile written above. Every expansion is quoted so that values containing
# whitespace or glob characters are passed through as single, literal
# arguments. WANDB__SERVICE_WAIT is set only in the environment of this one
# command.
# NOTE(review): 3000 is presumably a wandb service wait time in seconds --
# confirm.
WANDB__SERVICE_WAIT=3000 deepspeed \
    --num_nodes="$num_nodes" \
    --num_gpus="$num_gpus" \
    --hostfile="hostfile_${trial}" \
    sft_merged_chatbot.py --model_name "$model_name"
