#!/bin/bash

# Script to run hyperparameter optimization on all local datasets

# Default parameters.
# Each value below is only a default: a non-empty variable of the same name
# already present in the environment takes precedence, e.g.
#   N_TRIALS=10 DEVICE=cpu MAX_PARALLEL=2 bash <this script>
N_TRIALS="${N_TRIALS:-100}"                      # passed to mlp_c.py as --n_trials
EPOCHS="${EPOCHS:-100}"                          # passed to mlp_c.py as --epochs
DEVICE="${DEVICE:-cuda}"                         # passed to mlp_c.py as --device
OUTPUT_DIR="${OUTPUT_DIR:-./runs}"               # passed to mlp_c.py as --output_dir
MAX_PARALLEL="${MAX_PARALLEL:-22}"               # number of tasks xargs runs at once
STORAGE_BASE="${STORAGE_BASE:-sqlite:///optuna_db}"  # prefix of the per-task --storage URL
DATA_ROOT="${DATA_ROOT:-../data/full_datasets}"  # Adjust this to the correct path

# Create directories for logs and output.
# "optuna_db" matches the directory named by the default STORAGE_BASE.
# Stop right away if any directory cannot be created, rather than letting
# every task fail later on.
mkdir -p -- logs "${OUTPUT_DIR}" optuna_db || exit 1

# Get list of available tasks by scanning the data directory.
# A task is a directory named openml_task_<id> directly under DATA_ROOT that
# contains all four gzipped .npy split files; <id> is collected into TASK_IDS.
TASK_IDS=()
for TASK_DIR in "${DATA_ROOT}"/openml_task_*; do
    # Skip anything that is not a directory. This also covers the case where
    # the glob matched nothing and the literal pattern is left in TASK_DIR.
    [ -d "$TASK_DIR" ] || continue

    # Extract task ID from directory name: drop the leading path, then the
    # "openml_task_" prefix.
    TASK_ID=${TASK_DIR##*/}
    TASK_ID=${TASK_ID#openml_task_}

    # Check if required files exist
    if [ -f "${TASK_DIR}/X_train.npy.gz" ] &&
       [ -f "${TASK_DIR}/X_test.npy.gz" ] &&
       [ -f "${TASK_DIR}/y_train.npy.gz" ] &&
       [ -f "${TASK_DIR}/y_test.npy.gz" ]; then
        TASK_IDS+=("$TASK_ID")
    fi
done

# Check if any tasks were found; diagnostics go to stderr.
if [ "${#TASK_IDS[@]}" -eq 0 ]; then
    echo "Error: No valid task directories found in ${DATA_ROOT}" >&2
    echo "Make sure the path is correct and contains folders with the expected structure." >&2
    exit 1
fi

#######################################
# Run hyperparameter optimization (mlp_c.py) for a single task.
# Globals (read):
#   N_TRIALS, EPOCHS, DEVICE, OUTPUT_DIR, STORAGE_BASE, DATA_ROOT
# Arguments:
#   $1 - task ID; forwarded as --task_id and used to name the log file and
#        the per-task storage URL "${STORAGE_BASE}/task_<id>.db"
# Outputs:
#   The python process's stdout and stderr, merged, are copied to stdout and
#   written to logs/task_<id>.log (the file is overwritten by tee). A
#   start line and a success/failure line are printed to stdout.
# Returns:
#   0 if python succeeded, otherwise python's exit status (255 is reported
#   to the caller as 254, see below).
#######################################
run_task() {
    local task_id=$1
    echo "Starting task ${task_id}"

    python mlp_c.py \
        --task_id "${task_id}" \
        --n_trials "${N_TRIALS}" \
        --epochs "${EPOCHS}" \
        --device "${DEVICE}" \
        --output_dir "${OUTPUT_DIR}" \
        --storage "${STORAGE_BASE}/task_${task_id}.db" \
        --data_root "${DATA_ROOT}" \
        2>&1 | tee "logs/task_${task_id}.log"

    # PIPESTATUS[0] is the status of python; $? would be the status of tee.
    local exit_code=${PIPESTATUS[0]}
    if [ "${exit_code}" -eq 0 ]; then
        echo "Task ${task_id} completed successfully"
        return 0
    fi

    echo "Task ${task_id} failed with exit code ${exit_code}"

    # Propagate the failure so the caller can see it. xargs stops launching
    # further commands when one exits with 255, so that single value is
    # mapped to 254 to keep the remaining tasks running.
    if [ "${exit_code}" -eq 255 ]; then
        return 254
    fi
    return "${exit_code}"
}

# Report the effective settings and the list of discovered tasks.
# One printf call emits one line per argument. "${TASK_IDS[*]}" joins the IDs
# with the first character of IFS, which is a space unless IFS was changed.
printf '%s\n' \
    "Running with the following configuration:" \
    "Number of tasks: ${#TASK_IDS[@]}" \
    "Maximum parallel processes: ${MAX_PARALLEL}" \
    "Number of trials: ${N_TRIALS}" \
    "Epochs: ${EPOCHS}" \
    "Device: ${DEVICE}" \
    "Output directory: ${OUTPUT_DIR}" \
    "Optuna storage base: ${STORAGE_BASE}" \
    "Data root directory: ${DATA_ROOT}" \
    "----------------------------------------" \
    "Tasks to process: ${TASK_IDS[*]}" \
    "----------------------------------------"

# Export the function so it can be used with xargs: each task runs in a fresh
# `bash -c` child, which only sees exported functions and variables.
export -f run_task

# Export variables so they're available to the subshells
export N_TRIALS EPOCHS DEVICE OUTPUT_DIR STORAGE_BASE DATA_ROOT

# Run tasks in parallel using xargs: one task ID per input line, at most
# MAX_PARALLEL children at a time. The ID is passed as a positional argument
# ("_" fills $0) instead of being spliced into the command string.
printf '%s\n' "${TASK_IDS[@]}" \
    | xargs -I {} -P "${MAX_PARALLEL}" bash -c 'run_task "$@"' _ {}
# $? after a pipeline is the status of its last command, i.e. xargs.
xargs_status=$?

# Do not claim success (or exit 0) when xargs reported a problem.
if [ "${xargs_status}" -ne 0 ]; then
    echo "Task execution reported failures (xargs exit status ${xargs_status}). Check logs directory for detailed output." >&2
    exit "${xargs_status}"
fi

echo "All tasks completed. Check logs directory for detailed output."