#!/bin/bash
#SBATCH --job-name="RM_log"      # Assign a short name to your job
#SBATCH --nodes=1                   # Number of nodes you require
#SBATCH --ntasks=1                  # Total number of tasks across all nodes
#SBATCH --cpus-per-task=4           # Number of CPU cores per task
#SBATCH --mem=80gb                   # Job memory request
#SBATCH --time=7-00:00:00             # Time limit days-hrs:min:sec (7 days)
#SBATCH --output=RM_log-%j.out   # Standard output log file
#SBATCH --error=RM_log-%j.err    # Standard error log file
#SBATCH --partition=week-long-gpu       # GPU partition
#SBATCH --gres=gpu:1             # Request 1 GPU

# ---------------------------------------------------------------------------
# Job body: activate the conda environment, run the training script through
# `accelerate launch` on the allocated GPU, and propagate the training exit
# status so the scheduler records a failed run as failed.
# ---------------------------------------------------------------------------

echo "Starting the GPU job!"
date                                # Output the current date and time

# export CUDA_VISIBLE_DEVICES=1

# Batch shells are non-interactive, so the `conda` shell function that
# `conda activate` depends on is normally not defined. Install it explicitly
# via the shell hook before activating.
if ! command -v conda >/dev/null 2>&1; then
  echo "error: conda not found on PATH; load the conda module first" >&2
  exit 1
fi
eval "$(conda shell.bash hook)"
if ! conda activate py310; then
  # Abort rather than run a week-long job in the wrong Python environment.
  echo "error: failed to activate conda environment 'py310'" >&2
  exit 1
fi

export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1
export DEEPSPEED_LOG_LEVEL=info

# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# accelerate launch hf_train.py --num_processes 2

# Respect the device the scheduler assigned for --gres=gpu:1; fall back to
# GPU 0 only when CUDA_VISIBLE_DEVICES is unset (e.g. run outside SLURM).
# NOTE: the output directory uses plain ASCII quotes; typographic quotes are
# not shell quoting and would end up inside the directory name.
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" \
  accelerate launch Oracletraining.py --output_dir="Oracletraining_dir"
train_status=$?

if (( train_status == 0 )); then
  echo "GPU Job completed!"
else
  echo "error: GPU job failed with exit status ${train_status}" >&2
fi
date                                # Output the current date and time again

# Hand the training exit status back to the scheduler instead of `date`'s.
exit "${train_status}"
 

