#!/bin/bash
#SBATCH --job-name="RM_phi_training"      # Assign a short name to your job
#SBATCH --nodes=1                   # Number of nodes you require
#SBATCH --ntasks=1                  # Total number of tasks across all nodes
#SBATCH --cpus-per-task=4           # Number of CPU cores per task
#SBATCH --mem=80gb                   # Job memory request
#SBATCH --time=7-00:00:00             # Time limit days-hrs:min:sec (7 days)
#SBATCH --output=RM_phi_training-%j.out   # Standard output log file (%j = job ID)
#SBATCH --error=RM_phi_training-%j.err    # Standard error log file
#SBATCH --partition=week-long-gpu       # GPU partition
#SBATCH --gres=gpu:1             # Request 1 GPU

# ---------------------------------------------------------------------------
# Job body: activate the conda environment, enable diagnostic logging, run
# reward_model_training_phi.py through `accelerate launch`, and propagate the
# launcher's exit status so a failed run is not reported as successful.
# ---------------------------------------------------------------------------

echo "Starting the GPU job!"
date                                # Output the current date and time

# Load modules here if the cluster requires it (depends on the cluster setup).

# Batch shells are non-interactive, so the `conda` shell function that
# `conda activate` depends on is normally not defined. Install it through the
# shell hook first, and fail fast instead of training in the wrong environment.
if ! command -v conda >/dev/null 2>&1; then
  echo "ERROR: conda not found; load the cluster's conda module or fix PATH." >&2
  exit 1
fi
eval "$(conda shell.bash hook)"
if ! conda activate py310; then
  echo "ERROR: could not activate conda environment 'py310'." >&2
  exit 1
fi

# Verbose logging to help diagnose failures.
export NCCL_DEBUG=INFO              # NCCL prints initialization/communication info
export PYTHONFAULTHANDLER=1         # Python dumps tracebacks on fatal signals
export DEEPSPEED_LOG_LEVEL=info     # NOTE(review): presumably read by DeepSpeed - confirm

# Optional knobs, disabled by default:
# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# accelerate launch hf_train.py --num_processes 2

# Run the training. SLURM sets CUDA_VISIBLE_DEVICES for the GPU granted by
# --gres, so respect that allocation; device 3 is only a fallback for runs
# where the variable is not set at all (e.g. launched outside the scheduler).
# The output directory uses plain ASCII quotes: typographic quotes are not
# shell quoting characters and would become part of the directory name.
status=0
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-3}" \
  accelerate launch reward_model_training_phi.py \
  --output_dir="dro_experiments_PHI_RM" || status=$?

if (( status == 0 )); then
  echo "GPU Job completed!"
else
  echo "GPU job FAILED with exit status ${status}." >&2
fi
date                                # Output the current date and time again

# Hand the training status back to SLURM so failures are recorded as such.
exit "${status}"
 

