#!/usr/bin/env bash
set -euo pipefail  # exit on error, undefined variable, or pipeline failure

#############################
#     CONFIGURABLE PARTS    #
#############################
# --- Scheduler limit ---
# Largest number of tasks the system accepts in a single job array.
max_array_tasks=18

# --- Sweep layout ---
# Total number of tasks in the parameter sweep.
total_jobs=54
# Tasks submitted per array job; must not exceed max_array_tasks.
batch_size=18
# Upper bound on tasks of one array running at the same time.
concurrency=6

# --- Submission ---
# Slurm batch script handed to sbatch for every array.
slurm_script='cardinality_decision_lr_L2D.slurm'
# Number of times an sbatch submission is tried before giving up.
max_attempts=3
# Pause between two submission attempts, in seconds.
retry_delay=30

#############################
#      Parameter Checks     #
#############################
# Validate the numeric settings before they are used in arithmetic or passed
# to sbatch. batch_size in particular must be positive: the submission loop
# advances start_idx by batch_size, so a value of 0 would never terminate.
positive_int_re='^[1-9][0-9]*$'
non_negative_int_re='^(0|[1-9][0-9]*)$'

for cfg_name in batch_size concurrency max_array_tasks max_attempts; do
    # Indirect expansion with an empty default, so an unset setting produces
    # the message below instead of a bare "unbound variable" abort.
    cfg_value=${!cfg_name-}
    if [[ ! "$cfg_value" =~ $positive_int_re ]]; then
        echo "Error: $cfg_name must be a positive integer (got '${cfg_value}')." >&2
        exit 1
    fi
done

# total_jobs may be 0, in which case no batch is submitted at all.
if [[ ! "${total_jobs-}" =~ $non_negative_int_re ]]; then
    echo "Error: total_jobs must be a non-negative integer (got '${total_jobs-}')." >&2
    exit 1
fi
unset cfg_name cfg_value positive_int_re non_negative_int_re

# A batch may not be larger than what the system accepts in one array.
if (( batch_size > max_array_tasks )); then
    echo "Error: batch_size ($batch_size) exceeds system limit of $max_array_tasks tasks per array." >&2
    exit 1
fi

# Running more tasks in parallel than a batch contains is meaningless; clamp
# the limit rather than failing.
if (( concurrency > batch_size )); then
    echo "Warning: concurrency ($concurrency) is greater than batch_size ($batch_size). Adjusting concurrency to $batch_size." >&2
    concurrency=$batch_size
fi

#############################
#   Function Definitions    #
#############################

#######################################
# Submit one Slurm job array, retrying when sbatch fails.
# Globals (read):
#   start_idx, end_idx  - first and last task index of the array
#   concurrency         - limit placed after '%' in the --array spec
#   slurm_script        - batch script passed to sbatch
#   max_attempts        - number of submission attempts before giving up
#   retry_delay         - argument passed to sleep between attempts
# Arguments: none
# Outputs:
#   stdout - exactly the text sbatch printed on the successful attempt
#            (its stdout and stderr combined), so that callers can capture
#            it with $(submit_job) and parse it
#   stderr - all progress and error messages
# Returns: 0 on success; calls `exit 1` once every attempt has failed.
#######################################
submit_job() {
    local attempt=1
    local output
    while (( attempt <= max_attempts )); do
        # Diagnostics go to stderr: stdout is reserved for the sbatch result,
        # and would otherwise be swallowed by a capturing caller.
        echo "Attempt $attempt/$max_attempts to submit tasks ${start_idx}..${end_idx}..." >&2
        # Capture both stdout and stderr of sbatch.
        if output=$(sbatch --array="${start_idx}-${end_idx}%${concurrency}" "$slurm_script" 2>&1); then
            echo "sbatch command returned: $output" >&2
            # Emit the result once only, so a caller extracting the job ID
            # sees a single match.
            printf '%s\n' "$output"
            return 0
        fi

        echo "Error in sbatch submission: $output" >&2
        # No point in sleeping after the final attempt.
        if (( attempt < max_attempts )); then
            echo "Retrying in ${retry_delay} seconds..." >&2
            sleep "$retry_delay"
        fi
        attempt=$((attempt + 1))
    done

    echo "Error: sbatch submission failed after ${max_attempts} attempts. Exiting." >&2
    exit 1
}

#############################
#        MAIN LOOP          #
#############################
echo "Starting submission of ${total_jobs} jobs in batches of ${batch_size}, with concurrency limit ${concurrency}."

# User whose queue is polled. Fall back to id(1) when USER is not set, so
# that `set -u` does not abort the script in a minimal environment.
queue_user=${USER:-$(id -un)}

# Confirmation line printed by sbatch; the capture group is the job ID.
job_id_pattern='Submitted batch job ([0-9]+)'

# Submit the sweep one batch at a time; each batch is a single job array and
# the next one is only submitted once the previous array has left the queue.
for ((start_idx = 0; start_idx < total_jobs; start_idx += batch_size)); do
    # Last task index in this batch, clamped to the final task of the sweep.
    end_idx=$((start_idx + batch_size - 1))
    if (( end_idx >= total_jobs )); then
        end_idx=$((total_jobs - 1))
    fi

    echo ""
    echo "Submitting tasks ${start_idx}..${end_idx}..."

    #############################
    # Submit the Slurm array job
    #############################
    # Testing the assignment explicitly (instead of relying on `set -e`) lets
    # us show whatever submit_job wrote to stdout before exiting; otherwise a
    # failed submission would terminate the script without any message.
    if ! sbatch_output=$(submit_job); then
        if [[ -n "$sbatch_output" ]]; then
            printf '%s\n' "$sbatch_output" >&2
        fi
        echo "Error: submission of tasks ${start_idx}..${end_idx} failed. Exiting." >&2
        exit 1
    fi

    # Extract the job ID from the first "Submitted batch job <jobid>" in the
    # captured text. Using the shell's own regex match yields a single ID even
    # if the line occurs more than once, and a non-match does not trip
    # `set -e`/`pipefail` before the error below can be reported.
    if [[ "$sbatch_output" =~ $job_id_pattern ]]; then
        job_id=${BASH_REMATCH[1]}
    else
        echo "Error: Failed to retrieve job ID from sbatch output. Exiting." >&2
        printf '%s\n' "$sbatch_output" >&2
        exit 1
    fi
    echo "Submitted batch (array) with parent job ID: $job_id"

    #############################
    # Wait for the batch to finish
    #############################
    echo "Waiting for all tasks in array ${job_id} to complete..."
    # The array is still active while squeue lists a record whose first
    # column is the parent ID itself or starts with "<id>_". The prefix test
    # covers single tasks ("<id>_3") as well as bracketed ranges of pending
    # tasks ("<id>_[6-17%6]"). awk reads the whole listing before exiting, so
    # the pipeline cannot fail through SIGPIPE under `pipefail`.
    # NOTE: a failing squeue makes the pipeline non-zero and therefore ends
    # the wait, exactly as if the array had finished.
    while squeue -u "$queue_user" -h \
        | awk -v id="$job_id" '
            $1 == id || index($1, id "_") == 1 { found = 1 }
            END { exit !found }'; do
        sleep 10
    done

    echo "Batch (job ID ${job_id}) completed."
done

echo ""
echo "All batches complete!"
