#!/bin/bash
# run.sh - Run full experiment pipeline with parallel fold processing

# ---------------------------------------------------------------------------
# Pipeline settings
# ---------------------------------------------------------------------------
# Root directory of the datasets; a dataset's folds are looked up under
# "$BASE_DIR/<dataset>/weka".
BASE_DIR='../datasets-orreview/ordinal-regression'
# Output locations (created below if they do not exist yet).
RESULTS_DIR='results'
VIS_DIR='vis/aggregated'
LOGS_DIR='logs'
# Value handed to experiment.py through CUDA_VISIBLE_DEVICES.
GPU=0
# Upper bound on the number of folds that run at the same time.
MAX_PARALLEL_JOBS=30

# Dataset names and link functions the script knows about.
DATASETS=(ERA LEV SWD car winequality-red)
LINKS=(logit probit)

# Make sure every output directory exists before anything is launched.
mkdir -p "$RESULTS_DIR" "$VIS_DIR" "$LOGS_DIR"

#######################################
# Run the whole pipeline for one dataset / link-function pair: launch
# experiment.py for every available fold (0-29) in parallel, then run
# plot.py and combine_plots.py.
# Globals:
#   BASE_DIR, LOGS_DIR, VIS_DIR, GPU, MAX_PARALLEL_JOBS (all read only)
# Arguments:
#   $1 - dataset name; fold files are looked up in "$BASE_DIR/<dataset>/weka"
#   $2 - link function, forwarded to the Python scripts
# Outputs:
#   Progress messages on stdout, warnings and errors on stderr. The output
#   of each fold is written to "$LOGS_DIR/<dataset>_<link>/fold<N>.log".
# Returns:
#   0 if every step succeeded; 1 if the dataset directory is missing or if
#   any fold, plot.py or combine_plots.py exited with a non-zero status.
#######################################
run_single_experiment() {
    local dataset=$1
    local link=$2
    local status=0
    local fold i pid rc

    echo "=========================================="
    echo "Starting pipeline for dataset: $dataset, link function: $link"
    echo "=========================================="

    # Check if dataset exists
    if [ ! -d "$BASE_DIR/$dataset/weka" ]; then
        echo "Error: Dataset directory '$BASE_DIR/$dataset/weka' does not exist!" >&2
        return 1
    fi

    # A limit below 1 would make the throttle loop below wait forever
    # (the number of running jobs is always >= 0), so fall back to 1.
    local max_jobs=${MAX_PARALLEL_JOBS:-}
    if ! [[ "$max_jobs" =~ ^[0-9]+$ ]] || [ "$max_jobs" -lt 1 ]; then
        echo "Warning: MAX_PARALLEL_JOBS='$max_jobs' is not a positive integer; running one fold at a time" >&2
        max_jobs=1
    fi

    # Create log directory for this dataset and link combination
    local log_dir="$LOGS_DIR/${dataset}_${link}"
    mkdir -p "$log_dir"

    # Step 1: Run experiment with parallel fold processing
    echo "Running experiment with parallel fold processing..."

    # Find all available folds for this dataset
    local available_folds=()
    for ((fold = 0; fold < 30; fold++)); do
        if [ -f "$BASE_DIR/$dataset/weka/train_${dataset}-${fold}.arff" ]; then
            available_folds+=("$fold")
        fi
    done

    echo "Found ${#available_folds[@]} available folds"

    local running=()         # PIDs of folds believed to be still running
    local launched_pids=()   # every PID started, index-aligned with launched_folds
    local launched_folds=()
    for fold in "${available_folds[@]}"; do
        # Throttle: wait until fewer than max_jobs folds are running.
        while [ "${#running[@]}" -ge "$max_jobs" ]; do
            for i in "${!running[@]}"; do
                # kill -0 only probes for existence; it sends no signal.
                if ! kill -0 "${running[$i]}" 2>/dev/null; then
                    unset "running[$i]"
                fi
            done
            # Re-index the array to remove the gaps left by unset.
            running=("${running[@]}")
            # Only sleep when no slot was freed, to avoid busy-waiting.
            if [ "${#running[@]}" -ge "$max_jobs" ]; then
                sleep 0.5
            fi
        done

        echo "Starting fold $fold"

        # Run the fold in a background subshell that exits with the status
        # of experiment.py, so a crash can be detected after the barrier.
        (
            CUDA_VISIBLE_DEVICES="$GPU" \
                python -u experiment.py \
                    --dataset "$dataset" \
                    --fold    "$fold" \
                    --link    "$link" \
                > "$log_dir/fold${fold}.log" 2>&1
            rc=$?

            if [ "$rc" -eq 0 ]; then
                echo "Completed fold $fold"
            else
                echo "Fold $fold failed with exit status $rc (see $log_dir/fold${fold}.log)" >&2
            fi
            exit "$rc"
        ) &
        pid=$!

        running+=("$pid")
        launched_pids+=("$pid")
        launched_folds+=("$fold")
    done

    # Barrier: wait for every fold and collect the ones that failed.
    echo "Waiting for all folds to complete..."
    local failed_folds=()
    for i in "${!launched_pids[@]}"; do
        if ! wait "${launched_pids[$i]}"; then
            failed_folds+=("${launched_folds[$i]}")
        fi
    done
    if [ "${#failed_folds[@]}" -gt 0 ]; then
        echo "Warning: ${#failed_folds[@]} fold(s) failed for $dataset with $link: ${failed_folds[*]}" >&2
        status=1
    fi

    # Steps 2 and 3 still run after a fold failure (best effort), but any
    # failure is reflected in the return status.

    # Step 2: Generate plots
    echo "Generating plots..."
    if ! python plot.py --dataset "$dataset" --link-function "$link" --vis-dir "$VIS_DIR"; then
        echo "Warning: plot.py failed for $dataset with $link" >&2
        status=1
    fi

    # Step 3: Combine plots
    echo "Combining plots..."
    if ! python combine_plots.py --dataset "$dataset" --vis-dir "$VIS_DIR" --link-function "$link"; then
        echo "Warning: combine_plots.py failed for $dataset with $link" >&2
        status=1
    fi

    if [ "$status" -eq 0 ]; then
        echo "Pipeline completed for $dataset with $link"
    else
        echo "Pipeline finished with errors for $dataset with $link" >&2
    fi
    return "$status"
}

# ---------------------------------------------------------------------------
# Command-line handling
#
# Usage: run.sh [dataset1,dataset2,...] [link1,link2,...] [max_parallel_jobs]
#   no arguments -> every dataset in DATASETS with every link in LINKS
#   1 argument   -> the listed datasets with every link in LINKS
#   2 arguments  -> the listed datasets with the listed links
#   3 arguments  -> as above, and MAX_PARALLEL_JOBS is taken from $3
#
# Exit status: 0 when every pipeline succeeded; 1 on invalid arguments or
# when run_single_experiment returned non-zero for any combination.
# ---------------------------------------------------------------------------

# Succeeds when $1 is equal to one of the remaining arguments.
is_one_of() {
    local needle=$1
    local candidate
    shift
    for candidate in "$@"; do
        if [ "$needle" == "$candidate" ]; then
            return 0
        fi
    done
    return 1
}

# Wrong number of arguments
if [ $# -gt 3 ]; then
    {
        echo "Usage: $0 [dataset1,dataset2,...] [link1,link2,...] [max_parallel_jobs]"
        echo "If no arguments are provided, processes all datasets with all link functions"
        echo "If only datasets are provided, processes those datasets with all link functions"
        echo "Available datasets: ${DATASETS[*]}"
        echo "Available link functions: ${LINKS[*]}"
        echo "Default max parallel jobs: $MAX_PARALLEL_JOBS"
    } >&2
    exit 1
fi

# Defaults: everything with everything.
SELECTED_DATASETS=("${DATASETS[@]}")
SELECTED_LINKS=("${LINKS[@]}")
DONE_MESSAGE="All processing completed!"

if [ $# -eq 0 ]; then
    echo "Processing all datasets with all link functions"
else
    DONE_MESSAGE="Processing completed!"

    # Parse datasets (comma-separated)
    IFS=',' read -ra SELECTED_DATASETS <<< "$1"
    if [ "${#SELECTED_DATASETS[@]}" -eq 0 ]; then
        # An empty list would otherwise run nothing and report success.
        echo "Error: No dataset given. Must be one or more of: ${DATASETS[*]}" >&2
        exit 1
    fi
    for DS in "${SELECTED_DATASETS[@]}"; do
        if ! is_one_of "$DS" "${DATASETS[@]}"; then
            echo "Error: Dataset '$DS' is not valid. Must be one of: ${DATASETS[*]}" >&2
            exit 1
        fi
    done

    if [ $# -ge 2 ]; then
        # Parse link functions (comma-separated), validated against LINKS
        # so the accepted values cannot drift from the configured list.
        IFS=',' read -ra SELECTED_LINKS <<< "$2"
        if [ "${#SELECTED_LINKS[@]}" -eq 0 ]; then
            echo "Error: No link function given. Must be one or more of: ${LINKS[*]}" >&2
            exit 1
        fi
        for LK in "${SELECTED_LINKS[@]}"; do
            if ! is_one_of "$LK" "${LINKS[@]}"; then
                echo "Error: Link function '$LK' is not valid. Must be one of: ${LINKS[*]}" >&2
                exit 1
            fi
        done
    fi

    # Third parameter: maximum number of parallel jobs. Zero is rejected
    # because a limit of 0 would never allow a fold to start.
    if [ $# -eq 3 ]; then
        if [[ "$3" =~ ^[0-9]+$ ]] && [ "$3" -ge 1 ]; then
            MAX_PARALLEL_JOBS=$3
            echo "Setting maximum parallel jobs to $MAX_PARALLEL_JOBS"
        else
            echo "Error: Third parameter must be a positive number (maximum parallel jobs)" >&2
            exit 1
        fi
    fi
fi

# Process each combination; keep going after a failure but remember it so
# the script's exit status reflects the outcome.
EXIT_STATUS=0
for DATASET in "${SELECTED_DATASETS[@]}"; do
    for LINK in "${SELECTED_LINKS[@]}"; do
        run_single_experiment "$DATASET" "$LINK" || EXIT_STATUS=1
    done
done

echo "$DONE_MESSAGE"
if [ "$EXIT_STATUS" -ne 0 ]; then
    echo "Warning: one or more pipelines reported errors" >&2
fi
exit "$EXIT_STATUS"