#!/bin/bash
# Load environment overrides from a .env file in the current working
# directory, if one exists.
if [[ -f .env ]]; then
    # allexport: every variable assigned while sourcing is marked for export.
    set -a
    # Use an explicit ./ path: for a name without a slash, `source` searches
    # $PATH before the current directory, so a stray .env on $PATH could be
    # loaded instead of the file that was just tested above.
    source ./.env
    set +a
fi

# Default values (the argument parser below overrides them from the command line)
LANGUAGE=""  # Comma-separated languages/tasks; empty means every directory found in the dataset directory
MODEL=""  # Model name (required, set with --model)
SAMPLE_SIZES="100"  # Sample sizes, forwarded to run_experiment.sh as --sample_sizes
VALIDATION_SETS="long"  # Validation sets, forwarded to run_experiment.sh as --validation_sets
PROMPT_TEMPLATE="io_prompt,zsr_prompt"  # Comma-separated prompt template names
VALIDATION_SIZE="100"  # Number of validation examples; forwarded only when non-empty
RESULTS_DIR="results"  # Directory to store results
MIN_LENGTH=""  # Minimum sequence length
MAX_PARALLEL=4  # Maximum parallel jobs
MODE="all"  # Mode of execution (create, submit, resubmit, process, all)
ENCODING_TECHNIQUE="many_to_one,one_to_one,one_to_many"  # Comma-separated encoding techniques
MIN_LONG_VALIDATION_LENGTH=""  # Minimum length for long validation sequences
SEED=42  # Random seed for reproducibility
DATASET="flare_subsampled"  # Dataset name; --dataset only accepts 'flare_subsampled'
CHECK_TOKENIZER=""
# Set only by --dataset_dir. Initialised here so that a value inherited from
# the environment or from .env is never mistaken for a command-line override.
CLI_DATASET_DIR=""

# Abort unless the option in $1 is followed by a value.
# Arguments: the remaining command-line words, option name first.
# Without this check a trailing option with no value makes `shift 2` fail
# without shifting anything, and the parsing loop below never terminates.
require_option_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value"
        exit 1
    fi
}

# Parse command line arguments. Every recognised option takes exactly one
# value, so the option/value pair is consumed once, after the case statement.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --language)
            require_option_value "$@"
            LANGUAGE="$2"  # Comma-separated list of tasks
            ;;
        --model)
            require_option_value "$@"
            MODEL="$2"  # Store model name
            ;;
        --prompt_template)
            require_option_value "$@"
            PROMPT_TEMPLATE="$2"  # Comma-separated prompt template names
            ;;
        --sample_sizes)
            require_option_value "$@"
            SAMPLE_SIZES="$2"  # List of sample sizes
            ;;
        --validation_sets)
            require_option_value "$@"
            VALIDATION_SETS="$2"  # List of validation sets
            ;;
        --validation_size)
            require_option_value "$@"
            VALIDATION_SIZE="$2"  # Set validation size
            ;;
        --results_dir)
            require_option_value "$@"
            RESULTS_DIR="$2"  # Directory to store results
            ;;
        --min_length)
            require_option_value "$@"
            MIN_LENGTH="$2"  # Set minimum sequence length
            ;;
        --min_long_validation_length)
            require_option_value "$@"
            MIN_LONG_VALIDATION_LENGTH="$2"  # Set minimum sequence length for long validation
            ;;
        --max_parallel)
            require_option_value "$@"
            MAX_PARALLEL="$2"  # Set max parallel jobs
            ;;
        --mode)
            require_option_value "$@"
            MODE="$2"  # Set workflow mode
            ;;
        --seed)
            require_option_value "$@"
            SEED="$2"  # Set random seed for reproducibility
            ;;
        --encoding_technique)
            require_option_value "$@"
            ENCODING_TECHNIQUE="$2"  # Comma-separated encoding techniques
            ;;
        --dataset)
            require_option_value "$@"
            if [[ "$2" != "flare_subsampled" ]]; then
                # The message names the value the check actually accepts.
                echo "Error: --dataset must be 'flare_subsampled'"
                exit 1
            fi
            DATASET="$2"  # Store dataset name
            ;;
        --dataset_dir)
            require_option_value "$@"
            CLI_DATASET_DIR="$2"  # Dataset directory given on the command line
            ;;
        *)
            echo "Unknown parameter: $1"  # Handle unknown parameters
            exit 1
            ;;
    esac
    shift 2
done

# Print the command-line help to stdout.
# Defined before the validation below: bash resolves function names at call
# time, so calling show_usage ahead of its definition fails with
# "command not found".
show_usage() {
    cat <<EOF
Usage: $0 [options]
Options:
  --language <tasks>         Comma-separated list of tasks (empty for all languages)
  --model <model>            Model name (required)
  --prompt_template <names>  Comma-separated prompt templates, e.g. io_prompt,zsr_prompt (default: io_prompt,zsr_prompt)
  --sample_sizes <sizes>     Comma-separated list of sample sizes (default: 100)
  --validation_sets <sets>   Comma-separated list of validation sets (default: long)
  --validation_size <size>   Number of validation examples (default: 100)
  --results_dir <dir>        Directory to store results (default: results)
  --min_length <length>      Minimum sequence length
  --min_long_validation_length <length>  Minimum sequence length for long validation
  --max_parallel <n>         Maximum parallel jobs (default: 4)
  --mode <mode>              Workflow mode (default: all)
  --seed <seed>              Random seed for reproducibility (default: 42)
  --encoding_technique <list>  Comma-separated encoding techniques: many_to_one, one_to_one, one_to_many (default: all three)
  --dataset <name>           Dataset name, must be 'flare_subsampled' (default: flare_subsampled)
  --dataset_dir <dir>        Dataset directory (takes precedence over DATASET_DIR from .env)
EOF
}

# Validate required settings
if [[ -z "$MODEL" ]]; then
    echo "Error: --model is required"
    show_usage
    exit 1
fi

if [[ -z "$DATASET" ]]; then
    echo "Error: --dataset is required"
    show_usage
    exit 1
fi

# Preserve the DATASET_DIR value that came from the environment / .env under a
# separate name, then clear DATASET_DIR so it can be decided afresh later.
ENV_DATASET_DIR="$DATASET_DIR"
unset DATASET_DIR

# Output layout: <BASE_DIR>/<model name with every '/' turned into '_'>
BASE_DIR="experiments"
# A '/' inside the model name would otherwise act as a path separator.
SAFE_MODEL_NAME="${MODEL//\//_}"
MODEL_DIR="${BASE_DIR}/${SAFE_MODEL_NAME}"


# Pick the dataset directory. Precedence: command line, then the value
# preserved from .env, then the dataset name itself.
if [[ -z "$CLI_DATASET_DIR" && -z "$ENV_DATASET_DIR" ]]; then
    DATASET_DIR="$DATASET"
    echo "Using default dataset directory: $DATASET_DIR"
elif [[ -z "$CLI_DATASET_DIR" ]]; then
    DATASET_DIR="$ENV_DATASET_DIR"
    echo "Using dataset directory from .env: $DATASET_DIR"
else
    DATASET_DIR="$CLI_DATASET_DIR"
    echo "Using dataset directory from CLI: $DATASET_DIR"
fi

# Print the names of all sub-directories of $DATASET_DIR as one
# comma-separated line (an empty line when there are none).
# Globals:  DATASET_DIR (read)
# Outputs:  comma-separated directory names on stdout
get_all_languages() {
    local tasks=()
    local dir  # keep the loop variable out of the global scope
    for dir in "$DATASET_DIR"/*; do
        # Skips plain files, and also the literal unmatched pattern that the
        # glob leaves behind when the directory is empty or missing.
        if [[ -d "$dir" ]]; then
            tasks+=("${dir##*/}")
        fi
    done
    # "${tasks[*]}" joins on the first character of IFS; a local IFS confines
    # the change to this function without forking a subshell.
    local IFS=','
    printf '%s\n' "${tasks[*]}"
}

# Set LANGUAGES_LIST (comma-separated): the --language value when one was
# given, otherwise every sub-directory found in the dataset directory.
if [[ -z "$LANGUAGE" ]]; then
    LANGUAGES_LIST=$(get_all_languages)
    if [[ -z "$LANGUAGES_LIST" ]]; then
        # With nothing discovered the rest of the script would have no
        # languages to work on, yet would still report success.
        echo "Error: no language directories found in dataset directory: $DATASET_DIR"
        exit 1
    fi
    echo "No tasks specified, using all available languages: $LANGUAGES_LIST"
else
    LANGUAGES_LIST="$LANGUAGE"
    echo "Processing specified languages: $LANGUAGES_LIST"
fi

# Build ENCODING_TECHNIQUES_LIST: split the comma-separated ENCODING_TECHNIQUE
# when it is non-empty, otherwise fall back to the three built-in techniques.
if [[ -n "$ENCODING_TECHNIQUE" ]]; then
    # Explicit selection
    IFS=',' read -ra ENCODING_TECHNIQUES_LIST <<< "$ENCODING_TECHNIQUE"
    echo "Processing specified encoding techniques: ${ENCODING_TECHNIQUES_LIST[@]}"
else
    # Nothing selected: use the built-in set
    ENCODING_TECHNIQUES_LIST=(one_to_one one_to_many many_to_one)
    echo "No encoding techniques specified, using default encoding techniques: ${ENCODING_TECHNIQUES_LIST[@]}"
fi

# Build PROMPT_TEMPLATES_LIST: split the comma-separated PROMPT_TEMPLATE when
# it is non-empty, otherwise fall back to the two built-in templates.
if [[ -n "$PROMPT_TEMPLATE" ]]; then
    IFS=',' read -ra PROMPT_TEMPLATES_LIST <<< "$PROMPT_TEMPLATE"
    echo "Processing specified prompt templates: ${PROMPT_TEMPLATES_LIST[@]}"
else
    PROMPT_TEMPLATES_LIST=(io_prompt zsr_prompt)
    echo "No prompt templates specified, using default prompt templates: ${PROMPT_TEMPLATES_LIST[@]}"
fi

# Create the experiment directory tree
#   <BASE_DIR>/<model>/<prompt template>/<encoding technique>/<language>
# for every combination of prompt template, encoding technique and language.
# Globals:
#   MODEL, LANGUAGES_LIST, PROMPT_TEMPLATES_LIST, ENCODING_TECHNIQUES_LIST (read)
#   BASE_DIR (read; "experiments" is used when it is unset or empty)
#   SAFE_MODEL_NAME, MODEL_DIR, LANG_ARRAY (written)
# Exits with status 1 when a directory cannot be created.
create_folders() {
    echo "Creating folders..."

    SAFE_MODEL_NAME="${MODEL//\//_}"  # Replace '/' with '_'
    MODEL_DIR="${BASE_DIR:-experiments}/$SAFE_MODEL_NAME"

    # Convert LANGUAGES_LIST from comma-separated string to array.
    # LANG_ARRAY stays global because code outside this function may read it.
    IFS=',' read -ra LANG_ARRAY <<< "$LANGUAGES_LIST"

    # Loop and scratch variables are local so they do not leak to the caller.
    local prompt_template language encoding target_dir
    for prompt_template in "${PROMPT_TEMPLATES_LIST[@]}"; do
        for language in "${LANG_ARRAY[@]}"; do
            for encoding in "${ENCODING_TECHNIQUES_LIST[@]}"; do
                target_dir="$MODEL_DIR/$prompt_template/$encoding/$language"
                # Stop immediately instead of continuing with a missing tree.
                if ! mkdir -p "$target_dir"; then
                    echo "Error: could not create directory $target_dir"
                    exit 1
                fi
            done
        done
    done
}


# Run ./scripts/run_experiment.sh once per (prompt template, encoding
# technique, language) combination, stopping at the first failure.
# Globals (read):
#   MODEL, SAMPLE_SIZES, VALIDATION_SETS, VALIDATION_SIZE, SEED,
#   LANGUAGES_LIST, PROMPT_TEMPLATES_LIST, ENCODING_TECHNIQUES_LIST
# Exits with status 1 when run_experiment.sh returns a non-zero status.
execute_workflow() {
    local prompt_template encoding language
    local -a languages cmd

    # Split the language list here rather than depending on an array that
    # some other function happened to populate earlier.
    IFS=',' read -ra languages <<< "$LANGUAGES_LIST"

    echo "Executing $MODEL workflow..."
    for prompt_template in "${PROMPT_TEMPLATES_LIST[@]}"; do
        echo "With prompt template: $prompt_template"

        for encoding in "${ENCODING_TECHNIQUES_LIST[@]}"; do
            echo "With encoding: $encoding"
            for language in "${languages[@]}"; do
                echo "Running pipeline for language: $language"

                # Build the command as an array so each value reaches the
                # child script as exactly one argument; a string passed to
                # `eval` is re-parsed by the shell, which splits on whitespace
                # and executes any shell syntax embedded in a value.
                cmd=(
                    bash ./scripts/run_experiment.sh
                    --language "$language"
                    --model "$MODEL"
                    --sample_sizes "$SAMPLE_SIZES"
                    --validation_sets "$VALIDATION_SETS"
                    --prompt_template "$prompt_template"
                    --encoding_technique "$encoding"
                    --seed "$SEED"
                )

                if [[ -n "$VALIDATION_SIZE" ]]; then
                    cmd+=(--validation_size "$VALIDATION_SIZE")
                fi

                # %q prints each word shell-quoted, so the logged line can be
                # pasted back into a shell unchanged.
                printf 'Executing command:'
                printf ' %q' "${cmd[@]}"
                printf '\n'

                if ! "${cmd[@]}"; then
                    echo "Error: run_experiment.sh failed for $language"
                    exit 1
                fi
            done
        done
    done
}


# Entry point: build the directory tree first, then run every experiment.
create_folders
execute_workflow

printf '%s\n' "Workflow completed!"