#!/bin/bash
#SBATCH --job-name=datasetgen
#SBATCH --partition=h100                   # Use GPU partition "a100", "h100sxm"
#SBATCH --gres gpu:2                       # Set 2 GPUs per job
#SBATCH -c 32                              # Number of cores
#SBATCH -N 1                               # Ensure that all cores are on one machine
#SBATCH -t 4-00:00                         # Maximum run-time in D-HH:MM
#SBATCH --mem=256G                         # Memory pool for all cores
#SBATCH --output=%j.out                    # File to which STDOUT will be written
#SBATCH --error=%j.err                     # File to which STDERR will be written


# Example submissions: options given on the sbatch command line override the #SBATCH defaults above (here: partition and GPU count)
# sbatch -p a100 --gres gpu:a100:8 exps/datasetgen/scripts/build_dataset.sh
# sbatch -p h100 --gres gpu:h100:8 exps/datasetgen/scripts/build_dataset.sh -c exps/datasetgen/scripts/configs/eval_dataset.sh
# sbatch -p h100 --gres gpu:h100:8 exps/datasetgen/scripts/build_dataset.sh -c exps/datasetgen/scripts/configs/dataset_train.sh

# sbatch -p h100 --gres gpu:h100:8 exps/datasetgen/scripts/build_dataset.sh -c exps/datasetgen/scripts/configs/dataset_manual_spec_sz10.sh

# sbatch -p h100 --gres gpu:h100:8 exps/datasetgen/scripts/build_dataset.sh -c exps/datasetgen/scripts/configs/dataset_inferred_spec_sz2.sh


# Print the command-line synopsis on stdout, then terminate the script with
# exit status 1.
usage() {
    printf '%s\n' \
        "Usage: $0 [-c <experiment_config_file>]" \
        "  -c <config>     Experiment-specific config file (optional)"
    exit 1
}

# Function for logging with timestamp
# Writes "<YYYY-MM-DD HH:MM:SS> <message>" to stdout, wrapped in the terminal's
# colour 3 (setaf 3) when tput can produce the escape codes.
# Arguments: all arguments are joined with spaces to form the message.
log() {
    local color reset
    # tput fails and complains on stderr when TERM is unset or the terminal has
    # no colour capability (typical for batch jobs); fall back to plain text
    # instead of emitting an error for every log line.
    color=$(tput setaf 3 2>/dev/null) || color=""
    reset=$(tput sgr0 2>/dev/null) || reset=""
    # "$*" (not "$@") so the arguments form a single message string.
    printf '%s%s %s%s\n' "$color" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" "$reset"
}

# Log a completion message with timestamp.
# Writes "<YYYY-MM-DD HH:MM:SS> <message>" to stdout, wrapped in the terminal's
# colour 2 (setaf 2) when tput can produce the escape codes.
# Arguments: all arguments are joined with spaces to form the message.
log_done() {
    local color reset
    # tput fails and complains on stderr when TERM is unset or the terminal has
    # no colour capability; degrade to uncoloured output in that case.
    color=$(tput setaf 2 2>/dev/null) || color=""
    reset=$(tput sgr0 2>/dev/null) || reset=""
    # "$*" (not "$@") so the arguments form a single message string.
    printf '%s%s %s%s\n' "$color" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" "$reset"
}

# Log an error/warning message with timestamp.
# Writes "<YYYY-MM-DD HH:MM:SS> <message>" to stdout (same stream as the other
# log helpers), wrapped in the terminal's colour 1 (setaf 1) when tput can
# produce the escape codes.
# Arguments: all arguments are joined with spaces to form the message.
log_error() {
    local color reset
    # tput fails and complains on stderr when TERM is unset or the terminal has
    # no colour capability; degrade to uncoloured output in that case.
    color=$(tput setaf 1 2>/dev/null) || color=""
    reset=$(tput sgr0 2>/dev/null) || reset=""
    # "$*" (not "$@") so the arguments form a single message string.
    printf '%s%s %s%s\n' "$color" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" "$reset"
}

# Parse command line arguments
# The leading ':' in the optstring puts getopts in silent mode: an unknown
# option is reported as '?', and an option that is missing its argument is
# reported as ':' (with the option letter in OPTARG).
while getopts ":c:" opt; do
    case "${opt}" in
        c)
            CONFIG_FILE=$OPTARG
            ;;
        :)
            # Previously unhandled: "-c" without a value was silently ignored
            # and the default config was used instead.
            log_error "Error: option -${OPTARG} requires an argument."
            usage
            ;;
        \?)
            usage
            ;;
    esac
done

# Fall back to the default experiment config when CONFIG_FILE is unset or empty
: "${CONFIG_FILE:=exps/datasetgen/scripts/configs/default_config.sh}"

# Abort early unless the selected config is an existing regular file
if ! [ -f "$CONFIG_FILE" ]; then
    log_error "Error: Experiment config file '$CONFIG_FILE' not found."
    exit 1
fi

############################################################
# Source configuration files
############################################################
# The experiment config has already been checked for existence; apply the same
# check to the global config. Without it a missing file only makes `source`
# print an error, and the script would carry on with unset variables.
if [ ! -f "exps/datasetgen/scripts/configs/global_config.sh" ]; then
    log_error "Error: Global config file 'exps/datasetgen/scripts/configs/global_config.sh' not found."
    exit 1
fi

# Order matters: the experiment config is loaded first, the global config second.
source "$CONFIG_FILE"
source "exps/datasetgen/scripts/configs/global_config.sh"

############################################################
# Determine SEED_FILE based on ITERATION
############################################################
# ITERATION is compared numerically below. An empty or non-numeric value would
# only trigger a "[: integer expression expected" message and silently select
# the iter>0 path, so reject it up front with a clear error.
if ! [[ "$ITERATION" =~ ^[0-9]+$ ]]; then
    log_error "Error: ITERATION must be set to a non-negative integer (got '$ITERATION')"
    exit 1
fi

if [ "$ITERATION" -eq 0 ]; then
    # First iteration: use the configured initial seed file.
    SEED_FILE="$SEED_FILE_ITER0"
else
    # Later iterations: use the seed dataset stored under this iteration's
    # results directory.
    SEED_FILE="exps/datasetgen/results/${DATE}/iter${ITERATION}/seed_dataset_iter${ITERATION}.json"
fi

############################################################
# GPU Configs
############################################################
# One device per comma-separated entry of CUDA_VISIBLE_DEVICES, i.e. the number
# of commas plus one. Note that an empty or unset CUDA_VISIBLE_DEVICES therefore
# still yields 1.
_gpu_commas="${CUDA_VISIBLE_DEVICES//[^,]/}"
NUM_GPUS=$(( ${#_gpu_commas} + 1 ))
unset _gpu_commas

# Warn (without aborting) when the tensor-parallel size differs from the GPU count
if [ "$TENSOR_PARALLEL_SIZE" -ne "$NUM_GPUS" ]; then
    log_error "WARNING: TENSOR_PARALLEL_SIZE ($TENSOR_PARALLEL_SIZE) is not the same as NUM_GPUS ($NUM_GPUS)"
fi

############################################################
# Configuration
############################################################
# Echo the effective run configuration, grouped by divider lines. Variables
# whose label equals their name are printed through indirect expansion.
_divider="--------------------------------"
for _cfg_name in ITERATION DATE; do
    log "${_cfg_name}: ${!_cfg_name}"
done
log "$_divider"
for _cfg_name in CODEGEN_PROMPT_NAME CODESCORING_PROMPT_NAME; do
    log "${_cfg_name}: ${!_cfg_name}"
done
log "$_divider"
log "Experiment Config: $CONFIG_FILE"
log "Seed file: $SEED_FILE"
log "$_divider"
for _cfg_name in SLURM_JOB_PARTITION CUDA_VISIBLE_DEVICES; do
    log "${_cfg_name}: ${!_cfg_name}"
done
log "Number of GPUs allocated: $NUM_GPUS"
for _cfg_name in TENSOR_PARALLEL_SIZE VLLM_MAX_NUM_SEQS; do
    log "${_cfg_name}: ${!_cfg_name}"
done
log "$_divider"
unset _divider _cfg_name

############################################################
# Nested Functions for Modular Structure
############################################################

#######################################
# CODEGEN stage: build the prompt parts, generate responses for every part,
# then post-process those responses.
#
# For steps 2 and 3, parts other than part 0 are submitted with sbatch while
# part 0 is run in the current shell. If a step submitted at least one job the
# whole script exits with status 0; outputs that already exist on disk are
# skipped on a later invocation.
#
# Globals read:
#   DATE, ITERATION, CONFIG_FILE, NUM_GPUS, DEBUG_MODE,
#   PYTHON_TURTLE, BUILD_PROMPT_PYTHON_FILE, SEED_FILE, SPEC_FILE,
#   CODEGEN_PROMPT_NAME, CODEGEN_PROMPT_TEMPLATE, CODEGEN_PROMPT_FILE,
#   CODEGEN_MODEL_NAME_SHORT, CODEGEN_RESPONSES_DIR, CODEGEN_POSTPROCESS_DIR,
#   N_COMBINE, N_COMBS_SAMPLE
# Globals written:
#   CODEGEN_PROMPT_PATTERN
#######################################
codegen() {
    CODEGEN_PROMPT_PATTERN="exps/datasetgen/results/${DATE}/iter${ITERATION}/codegen/prompts/prompts_${CODEGEN_PROMPT_NAME}_part_*.json"

    # Print the number of prompt part files matching CODEGEN_PROMPT_PATTERN.
    codegen_count_prompt_files() {
        local -a matches
        # Intentionally unquoted so that the glob in the pattern is expanded.
        matches=( ${CODEGEN_PROMPT_PATTERN} )
        # Without any match the array holds the literal pattern, which does
        # not exist as a file.
        if [ -e "${matches[0]}" ]; then
            echo "${#matches[@]}"
        else
            echo 0
        fi
    }

    # Print the number of parts to process: 1 in debug mode, otherwise the
    # number of prompt part files on disk.
    codegen_parts_to_process() {
        # Default to "false": a bare `if $DEBUG_MODE` with an unset/empty
        # variable expands to an empty command, which succeeds, and would
        # silently enable debug mode.
        if ${DEBUG_MODE:-false}; then
            echo 1
        else
            codegen_count_prompt_files
        fi
    }

    # Generate Prompts
    codegen_generate_prompts() {
        log "CODEGEN (1. Generate Prompts): Starting..."

        # Check if prompt parts exist
        if [ "$(codegen_count_prompt_files)" -gt 0 ]; then
            log_done "CODEGEN (1. Generate Prompts): All prompt parts exist. Skipping generation."
        else
            log "CODEGEN (1. Generate Prompts): Generating new prompts... (using seed file: ${SPEC_FILE:-$SEED_FILE})"
            # The interpreter, script and numeric options are left unquoted as
            # in the original invocation (they may hold several words).
            ${PYTHON_TURTLE} ${BUILD_PROMPT_PYTHON_FILE} \
                --input_file "${SEED_FILE}" \
                --prompt_template "${CODEGEN_PROMPT_TEMPLATE}" \
                --output_file "${CODEGEN_PROMPT_FILE}" \
                --seed_file "${SPEC_FILE:-$SEED_FILE}" \
                --n_combine ${N_COMBINE} \
                --n_combs_sample ${N_COMBS_SAMPLE} \
                --max_num_items 150000
        fi

        log_done "CODEGEN (1. Generate Prompts): Finished."
    }

    # Generate Responses
    codegen_generate_responses() {
        log "CODEGEN (2. Generate Responses): Starting..."

        # Count prompt parts
        local PROMPT_PARTS_COUNT
        PROMPT_PARTS_COUNT=$(codegen_parts_to_process)

        # Flag to check if any job was submitted
        local JOB_SUBMITTED=false
        local i PART_FILE PARTITION

        # Submit jobs for missing parts in reverse order (process part 0 last)
        for (( i = PROMPT_PARTS_COUNT - 1; i >= 0; i-- )); do
            PART_FILE="${CODEGEN_RESPONSES_DIR}/${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}_part_${i}.json"

            if [ -f "$PART_FILE" ]; then
                continue
            fi

            if [ "$i" -eq 0 ]; then
                log "CODEGEN (2. Generate Responses): Generating part $i in the current bash session"
                bash exps/datasetgen/scripts/codegen/build_responses.sh -c "$CONFIG_FILE" -p "$i"
            else
                log "CODEGEN (2. Generate Responses): Submitting job for part $i"
                # if odd number --> a100, otherwise h100 (so we can submit more jobs)
                if (( i % 2 == 1 )); then
                    PARTITION=a100
                else
                    PARTITION=h100
                fi
                sbatch \
                    -p "$PARTITION" \
                    --gres "gpu:${PARTITION}:${NUM_GPUS}" \
                    exps/datasetgen/scripts/codegen/build_responses.sh -c "$CONFIG_FILE" -p "$i"
                JOB_SUBMITTED=true
            fi
        done

        # Exit if jobs are submitted
        if $JOB_SUBMITTED; then
            log "CODEGEN (2. Generate Responses): Jobs submitted for missing parts. Exiting."
            exit 0
        fi

        log_done "CODEGEN (2. Generate Responses): Finished."
    }

    # Postprocess Responses
    codegen_postprocess_responses() {
        log "CODEGEN (3. Postprocess Responses): Starting..."

        # Count prompt parts
        local PROMPT_PARTS_COUNT
        PROMPT_PARTS_COUNT=$(codegen_parts_to_process)

        # Flag to check if any job was submitted
        local JOB_SUBMITTED=false
        local i OUTPUT

        # Postprocess missing parts in reverse order (process part 0 last)
        for (( i = PROMPT_PARTS_COUNT - 1; i >= 0; i-- )); do
            OUTPUT="${CODEGEN_POSTPROCESS_DIR}/${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}_part_${i}.json"

            if [ -f "${OUTPUT}" ]; then
                continue
            fi

            if [ "$i" -eq 0 ]; then
                log "CODEGEN (3. Postprocess Responses): Generating part $i in the current bash session"
                bash exps/datasetgen/scripts/codegen/postprocess.sh -c "$CONFIG_FILE" -p "$i"
            else
                log "CODEGEN (3. Postprocess Responses): Submitting job for part $i"
                sbatch exps/datasetgen/scripts/codegen/postprocess.sh -c "$CONFIG_FILE" -p "$i"
                JOB_SUBMITTED=true
            fi
        done

        # Exit if jobs are submitted
        if $JOB_SUBMITTED; then
            log "CODEGEN (3. Postprocess Responses): Jobs submitted for missing parts. Exiting."
            exit 0
        fi

        log_done "CODEGEN (3. Postprocess Responses): Finished."
    }

    # Execute CODEGEN related tasks
    codegen_generate_prompts
    codegen_generate_responses
    codegen_postprocess_responses
}

#######################################
# CODEDEDUP stage: merge the post-processed CODEGEN parts with the seed file,
# then remove duplicates from the merged dataset. Each step is skipped when its
# output already exists or its input is missing.
#
# Globals read:
#   DATE, ITERATION, PYTHON_TURTLE, SEED_FILE, CODEGEN_POSTPROCESS_DIR,
#   CODEGEN_PROMPT_NAME, CODEGEN_MODEL_NAME_SHORT, CODEDEDUP_DIR,
#   EPS, MIN_SAMPLES
# Globals written:
#   DEDUP_INPUT, DEDUP_OUTPUT (DEDUP_OUTPUT is read by codescoring)
#######################################
codededup() {

    # Merge Datasets
    codededup_merge_datasets() {
        log "CODEDEDUP (1. Merge Datasets): Starting..."

        local OUTPUT="exps/datasetgen/results/${DATE}/iter${ITERATION}/codededup/merged_${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}.json"

        # Collect the post-processed part files, one array element per path
        # (kept in the order find reports them).
        local -a INPUT_PARTS=()
        mapfile -t INPUT_PARTS < <(find "${CODEGEN_POSTPROCESS_DIR}" -regex ".*/${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}_part_[0-9]+\.json" 2>/dev/null)

        # if output not exists and input parts exist
        # (The former `ls ${INPUT}` test degenerated to a bare `ls` when no
        # part was found, which succeeds, so the merge ran on the seed alone.)
        if [ ! -f "${OUTPUT}" ] && [ "${#INPUT_PARTS[@]}" -gt 0 ]; then
            log "CODEDEDUP (1. Merge Datasets): Merging multiple parts with seed file"
            # ${SEED_FILE:+...} passes the seed path as one word and drops the
            # argument entirely when SEED_FILE is empty.
            ${PYTHON_TURTLE} src/turtlegfx_datagen/codededup/merge_datasets.py \
                --input_paths "${INPUT_PARTS[@]}" ${SEED_FILE:+"${SEED_FILE}"} \
                --output_path "${OUTPUT}"
        else
            log_done "CODEDEDUP (1. Merge Datasets): Merged file ${OUTPUT} already exists or no input parts to merge. Skipping."
        fi
        log_done "CODEDEDUP (1. Merge Datasets): Finished."
    }

    # Remove Duplicates
    codededup_remove_duplicates() {
        log "CODEDEDUP (2. Remove Duplicates): Starting..."

        # Deliberately global: DEDUP_OUTPUT is consumed by the codescoring stage.
        DEDUP_OUTPUT="${CODEDEDUP_DIR}/dedup_${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}.json"
        # NOTE(review): presumably CODEDEDUP_DIR is the directory the merge step
        # writes to above (its path is spelled out there) - confirm in the config.
        DEDUP_INPUT="${CODEDEDUP_DIR}/merged_${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}.json"

        if [ ! -f "${DEDUP_OUTPUT}" ] && [ -e "${DEDUP_INPUT}" ]; then
            log "CODEDEDUP (2. Remove Duplicates): Removing Duplicates"
            ${PYTHON_TURTLE} src/turtlegfx_datagen/codededup/remove_duplicates.py \
                --input_path "${DEDUP_INPUT}" \
                --output_path "${DEDUP_OUTPUT}" \
                --remove_duplicates \
                --eps ${EPS} \
                --min_samples ${MIN_SAMPLES} \
                --batch_size 256
        else
            log_done "CODEDEDUP (2. Remove Duplicates): Deduped file already exists or no input files to dedup. Skipping."
        fi

        log_done "CODEDEDUP (2. Remove Duplicates): Finished."
    }

    # Execute CODEDEDUP related tasks
    codededup_merge_datasets
    codededup_remove_duplicates
}

#######################################
# CODESCORING stage: build scoring prompts from the deduplicated dataset,
# generate responses for every prompt part, then post-process them into the
# seed dataset of the next iteration.
#
# For step 2, parts other than part 0 are submitted with sbatch while part 0
# is run in the current shell. If at least one job was submitted the whole
# script exits with status 0; outputs that already exist are skipped on a
# later invocation.
#
# Globals read:
#   DATE, ITERATION, CONFIG_FILE, NUM_GPUS, DEBUG_MODE, SLURM_JOB_PARTITION,
#   PYTHON_QWEN, DEDUP_OUTPUT, CODEDEDUP_DIR, CODEGEN_PROMPT_NAME,
#   CODEGEN_MODEL_NAME_SHORT, CODESCORING_PROMPT_NAME,
#   CODESCORING_PROMPT_TEMPLATE, CODESCORING_PROMPTS_DIR,
#   CODESCORING_MODEL_NAME_SHORT, CODESCORING_RESPONSES_DIR
# Globals written:
#   CODESCORING_PROMPT_PATTERN
#######################################
codescoring() {
    CODESCORING_PROMPT_PATTERN="exps/datasetgen/results/${DATE}/iter${ITERATION}/codescoring/prompts/prompts_${CODESCORING_PROMPT_NAME}__${DATE}_part_*.json"

    # Print the number of prompt part files matching CODESCORING_PROMPT_PATTERN.
    codescoring_count_prompt_files() {
        local -a matches
        # Intentionally unquoted so that the glob in the pattern is expanded.
        matches=( ${CODESCORING_PROMPT_PATTERN} )
        # Without any match the array holds the literal pattern, which does
        # not exist as a file.
        if [ -e "${matches[0]}" ]; then
            echo "${#matches[@]}"
        else
            echo 0
        fi
    }

    codescoring_generate_prompts() {
        log "CODESCORING (1. Generate Prompts): Starting..."

        local CODESCORING_PROMPT_FILE="${CODESCORING_PROMPTS_DIR}/prompts_${CODESCORING_PROMPT_NAME}__${DATE}.json"

        # DEDUP_OUTPUT is only assigned when codededup ran earlier in this same
        # invocation. Fall back to the path codededup assigns to it so that
        # running this stage on its own does not pass an empty input file.
        local DEDUP_FILE="${DEDUP_OUTPUT:-${CODEDEDUP_DIR}/dedup_${CODEGEN_PROMPT_NAME}__${CODEGEN_MODEL_NAME_SHORT}__${DATE}.json}"

        # Check if prompt parts exist
        if [ "$(codescoring_count_prompt_files)" -gt 0 ]; then
            log_done "CODESCORING (1. Generate Prompts): All prompt parts exist. Skipping generation."
        else
            log "CODESCORING (1. Generate Prompts): Generating new prompts..."
            ${PYTHON_QWEN} src/turtlegfx_datagen/codescoring/build_prompts_codescoring_qwen2vl.py \
                --input_file "${DEDUP_FILE}" \
                --prompt_template "${CODESCORING_PROMPT_TEMPLATE}" \
                --output_file "${CODESCORING_PROMPT_FILE}" \
                --max_num_items 150000
        fi

        log_done "CODESCORING (1. Generate Prompts): Finished."
    }

    codescoring_generate_responses() {
        log "CODESCORING (2. Generate Responses): Starting..."

        # Count prompt parts (a single part in debug mode). Default to "false":
        # a bare `if $DEBUG_MODE` with an unset/empty variable expands to an
        # empty command, which succeeds, and would silently enable debug mode.
        local PROMPT_PARTS_COUNT
        if ${DEBUG_MODE:-false}; then
            PROMPT_PARTS_COUNT=1
        else
            PROMPT_PARTS_COUNT=$(codescoring_count_prompt_files)
        fi

        # Flag to check if any job was submitted
        local JOB_SUBMITTED=false
        local i OUTPUT

        # Submit jobs for missing parts in reverse order (process part 0 last)
        for (( i = PROMPT_PARTS_COUNT - 1; i >= 0; i-- )); do
            OUTPUT="${CODESCORING_RESPONSES_DIR}/${CODESCORING_PROMPT_NAME}__${CODESCORING_MODEL_NAME_SHORT}__${DATE}_part_${i}.json"

            if [ -f "${OUTPUT}" ]; then
                continue
            fi

            if [ "$i" -eq 0 ]; then
                log "CODESCORING (2. Generate Responses): Generating part $i in the current bash session"
                bash exps/datasetgen/scripts/codescoring/build_responses.sh -c "$CONFIG_FILE" -p "$i"
            else
                log "CODESCORING (2. Generate Responses): Submitting job for part $i"
                # Re-use the partition of the current job for the submitted ones.
                sbatch \
                    -p "${SLURM_JOB_PARTITION}" \
                    --gres "gpu:${SLURM_JOB_PARTITION}:${NUM_GPUS}" \
                    exps/datasetgen/scripts/codescoring/build_responses.sh -c "$CONFIG_FILE" -p "$i"
                JOB_SUBMITTED=true
            fi
        done

        # Exit if jobs are submitted
        if $JOB_SUBMITTED; then
            log_done "CODESCORING (2. Generate Responses): Jobs submitted for missing parts. Exiting."
            exit 0
        fi

        log_done "CODESCORING (2. Generate Responses): Finished."
    }

    codescoring_postprocess() {
        log "CODESCORING (3. Postprocess Responses): Starting..."

        # Collect the response part files, one array element per path.
        local -a RESPONSE_PARTS=()
        mapfile -t RESPONSE_PARTS < <(find "${CODESCORING_RESPONSES_DIR}" -regex ".*/${CODESCORING_PROMPT_NAME}__${CODESCORING_MODEL_NAME_SHORT}__${DATE}_part_[0-9]+\.json" 2>/dev/null)
        # The output is the seed dataset of the next iteration.
        local OUTPUT="exps/datasetgen/results/${DATE}/iter$((ITERATION+1))/seed_dataset_iter$((ITERATION+1)).json"

        if [ -f "${OUTPUT}" ]; then
            log_done "CODESCORING (3. Postprocess Responses): Postprocessed file ${OUTPUT} already exists. Skipping."
        elif [ "${#RESPONSE_PARTS[@]}" -eq 0 ]; then
            # The former `ls ${INPUT}` test degenerated to a bare `ls` when no
            # part was found, which succeeds, so postprocessing ran on nothing.
            log_done "CODESCORING (3. Postprocess Responses): No response parts found to postprocess. Skipping."
        else
            bash exps/datasetgen/scripts/codescoring/postprocess.sh -c "$CONFIG_FILE"
        fi

        log_done "CODESCORING (3. Postprocess Responses): Finished."
    }

    codescoring_generate_prompts
    codescoring_generate_responses
    codescoring_postprocess
}

# Run every stage named in PIPELINE (a whitespace-separated list), in order.
# An unrecognised name is reported and skipped; the remaining stages still run.
log "Running pipeline: $PIPELINE"
for stage in $PIPELINE; do
    log "Running pipeline step: $stage"
    if [[ "$stage" == "codegen" || "$stage" == "codededup" || "$stage" == "codescoring" ]]; then
        # Each stage is implemented by the function of the same name.
        "$stage"
    else
        log_error "Unknown step: $stage"
    fi
done
log_done "---------Dataset Generation Complete---------"

# Override config values if specified via command line
# NOTE(review): this section is reached only after the pipeline loop has run,
# and the option parsing in this file handles -c only, so CLI_ITERATION and
# CLI_DATE can only arrive through the environment or a sourced config.
# Confirm whether this was meant to run before the pipeline.
if [[ -n "$CLI_ITERATION" ]]; then
    ITERATION=$CLI_ITERATION
    log "Using command line ITERATION: $ITERATION"
fi

if [[ -n "$CLI_DATE" ]]; then
    DATE=$CLI_DATE
    log "Using command line DATE: $DATE"
fi

# Validate required parameters: ITERATION first, then DATE
for _required_name in ITERATION DATE; do
    if [[ -z "${!_required_name}" ]]; then
        log_error "Error: ${_required_name} must be set either in config file or via command line"
        exit 1
    fi
done
unset _required_name