#!/usr/bin/env bash
set -euo pipefail

#### CONFIGURATION ####
# Experiment root and the dataset identifier used by the sampling script.
BASE_DIR="experiments_random_final"
IDENTIFIER="40st_10sym_redo"
AUTOMATON=$IDENTIFIER  # Alias of IDENTIFIER, kept for consistent naming here

# Sampling parameters; these have to match the sampling script.
NUM_STATES=40
NUM_SYMBOLS=10
NUM_SAMPLES=2000
ACCEPTANCE_PROB=0.3
MAIN_SEED=123  # Fixed seed used by the sampling script

# Intervention-count range, matching the sampling script.
INTERVENTION_START=50
INTERVENTION_STEP=200
INTERVENTION_END=1050

# Upper bound on the number of jobs running at the same time.
MAX_JOBS=96

# Locations derived from the values above.
DATA_DIR="${BASE_DIR}/data/datasets"
MODELS_DIR="${BASE_DIR}/models_${IDENTIFIER}_${ACCEPTANCE_PROB}_${MAIN_SEED}"

# Debug: print the values the rest of the script depends on.
for cfg_name in IDENTIFIER DATA_DIR MODELS_DIR; do
  printf 'DEBUG: %s = %s\n' "${cfg_name}" "${!cfg_name}"
done
unset cfg_name

#### HELPERS ####
# Forward all arguments, unchanged, to the random-sampling python script.
# The exit status is that of the python process.
random_sample() {
  local -r sampler="src/intervention_sampling/neural_networks/random_sample.py"
  python "${sampler}" "$@"
}

# Mark the configuration for export so processes started by this script
# see the same values in their environment.
export \
  BASE_DIR IDENTIFIER AUTOMATON \
  DATA_DIR MODELS_DIR \
  INTERVENTION_START INTERVENTION_END INTERVENTION_STEP \
  MAX_JOBS \
  NUM_SAMPLES MAIN_SEED NUM_STATES NUM_SYMBOLS ACCEPTANCE_PROB

#######################################
# Run one job-list entry end to end: check that its data exists, make sure
# the vocabulary file exists, prepare the data, train a model, evaluate it
# and run the KL decomposition. Training, evaluation and KL decomposition
# are skipped when their output file is already present.
# Globals:
#   IDENTIFIER, DATA_DIR, MODELS_DIR (read; IDENTIFIER gets a hardcoded
#   fallback when empty or unset)
# Arguments:
#   $1 TYPE          "vanilla"; any other value selects the intervention layout
#   $2 SEMIRING      semiring directory name (intervention layout only)
#   $3 SEED          fixed seed used by the sampling script
#   $4 INTERVENTION  intervention kind (intervention layout only)
#   $5 TARGET        intervention target (intervention layout only)
#   $6 IC            intervention count (intervention layout only)
#   $7 ARCH          architecture name handed to the python scripts
#   $8 MSEED         model seed; in this function it only names the output dir
#   $9 TOP_SEED      topology seed, defaults to 1
# Outputs:
#   Progress and debug messages on stdout.
# Returns:
#   0 when the job ran, or was skipped because train/validation data is
#   missing; 1 when the vocabulary file could not be created or waited for.
#
# NOTE(review): the exit status of the prepare/train/evaluate/KL python steps
# is not checked here, so "Completed" is printed even if one of them failed
# (unless the calling shell has errexit enabled). Confirm this is intended.
# NOTE(review): MSEED is not passed to train_kl.py - confirm the training
# script obtains its seed some other way.
#######################################
run_single_job() {
  local TYPE="$1"
  local SEMIRING="$2"
  local SEED="$3"     # The fixed seed (123) from sampling script
  local INTERVENTION="$4"
  local TARGET="$5"
  local IC="$6"       # Intervention count (50, 250, 450, etc)
  local ARCH="$7"
  local MSEED="$8"    # Model seed (for training)
  local TOP_SEED="${9:-1}"  # Topology seed

  # Keep all working state local so nothing leaks into the calling shell.
  local base train_dir val_dir test_dir
  local LOCK_FILE TIMEOUT START_TIME CURRENT_TIME ELAPSED
  local max_tokens lr rel_path output_dir eval_dir eval_output kl_output
  local -a ARCH_ARGS=() kl_args=()

  # Make sure IDENTIFIER is defined (":-" keeps this check safe under set -u)
  if [[ -z "${IDENTIFIER:-}" ]]; then
    echo "ERROR: IDENTIFIER is not defined!"
    echo "Using hardcoded value: 40st_10sym_redo"
    IDENTIFIER="40st_10sym_redo"
  fi

  # Set paths based on data type
  if [[ $TYPE == "vanilla" ]]; then
    # Vanilla data: train/validation live directly under TOP_SEED, in a
    # fixed "1250" subdirectory.
    base="${DATA_DIR}/vanilla/${IDENTIFIER}/${SEED}/${TOP_SEED}"
    train_dir="${base}/train/1250"
    val_dir="${base}/validation/1250"
    test_dir="${base}/test"
  else
    # Intervention data: one subdirectory per intervention count.
    base="${DATA_DIR}/${SEMIRING}/${IDENTIFIER}/${SEED}/${TOP_SEED}/${INTERVENTION}/${TARGET}"
    train_dir="${base}/train/${IC}"
    val_dir="${base}/validation/${IC}"
    test_dir="${base}/test"
  fi

  # Print the paths for debugging
  echo "DEBUG: IDENTIFIER = ${IDENTIFIER}"
  echo "DEBUG: train_dir = ${train_dir}"
  echo "DEBUG: val_dir = ${val_dir}"
  echo "DEBUG: test_dir = ${test_dir}"

  # Missing training/validation data is not an error: the job is skipped.
  echo "Checking if training data exists at $train_dir/main.tok"
  if [[ ! -d "$train_dir" ]]; then
    echo "⚠️  [$TYPE] Training directory does not exist: $train_dir"
    return 0
  fi

  if [[ ! -f "$train_dir/main.tok" ]]; then
    echo "⚠️  [$TYPE] Training data file not found: $train_dir/main.tok"
    return 0
  fi

  echo "Checking if validation data exists at $val_dir/main.tok"
  if [[ ! -d "$val_dir" ]]; then
    echo "⚠️  [$TYPE] Validation directory does not exist: $val_dir"
    return 0
  fi

  if [[ ! -f "$val_dir/main.tok" ]]; then
    echo "⚠️  [$TYPE] Validation data file not found: $val_dir/main.tok"
    return 0
  fi

  # Check if test directory exists - we'll create it if not
  echo "Checking if test directory exists at $test_dir"
  if [[ ! -d "$test_dir" ]]; then
    echo "📁 Creating test directory: $test_dir"
    mkdir -p "$test_dir"
  fi

  echo "➡️  [$TYPE] arch=$ARCH | seed=$SEED | top=$TOP_SEED | target=${TARGET:-n/a} | ic=${IC} | model_seed=$MSEED"

  # Add jitter to avoid races when creating directories. awk's srand() with
  # no argument seeds from the time of day, so jobs started in the same second
  # could all sleep for the same duration; seed per process instead.
  sleep "$(awk -v min=0.1 -v max=3 -v seed="$(( $$ + RANDOM ))" \
    'BEGIN{srand(seed); print min+rand()*(max-min)}')"

  # Make sure the test directory exists
  mkdir -p "${test_dir}"

  # Check if vocabulary file exists and create it if needed
  if [[ ! -f "${test_dir}/main.vocab" ]]; then
    echo "🔄 Vocabulary file not found, attempting to create it..."

    # A lock directory serialises vocabulary generation between parallel
    # jobs: the job whose mkdir succeeds generates the file, others wait.
    LOCK_FILE="${test_dir}/.vocab_generation.lock"

    # Try to acquire lock
    if mkdir "$LOCK_FILE" 2>/dev/null; then
      echo "🔒 Acquired lock for vocabulary generation"

      # Double-check if another process created the file while we were waiting
      if [[ ! -f "${test_dir}/main.vocab" ]]; then
        echo "📝 Generating vocabulary file from tokenized data..."

        # Run the data preparation script to generate vocabulary
        python src/rau/tasks/language_modeling/prepare_data.py \
              --more-data-files "$train_dir"/main.{tok,prepared} \
              --more-data-files "$val_dir"/main.{tok,prepared} \
              --training-data "$test_dir" \
              --vocabulary-file "${test_dir}/main.vocab" \
              --never-allow-unk

        # Check if the vocabulary file was created successfully
        if [[ ! -f "${test_dir}/main.vocab" ]]; then
          echo "❌ Failed to create vocabulary file"
          rmdir "$LOCK_FILE"
          return 1
        fi

        echo "✅ Successfully created vocabulary file"
      else
        echo "📋 Vocabulary file was created by another process while waiting"
      fi

      # Release lock
      rmdir "$LOCK_FILE"
    else
      echo "⏳ Waiting for another process to generate the vocabulary file..."

      # Wait for the other process to finish (with timeout)
      TIMEOUT=300  # 5 minutes
      START_TIME=$(date +%s)

      while [[ ! -f "${test_dir}/main.vocab" ]]; do
        CURRENT_TIME=$(date +%s)
        ELAPSED=$((CURRENT_TIME - START_TIME))

        if [[ $ELAPSED -gt $TIMEOUT ]]; then
          echo "⚠️ Timeout waiting for vocabulary file to be created"
          return 1
        fi

        # Check if lock still exists
        if [[ ! -d "$LOCK_FILE" ]]; then
          # Lock was released but vocab still doesn't exist - something went wrong
          if [[ ! -f "${test_dir}/main.vocab" ]]; then
            echo "❌ Lock was released but vocabulary file wasn't created"
            return 1
          fi
        fi

        sleep 5
      done

      echo "📋 Vocabulary file is now available"
    fi
  fi

  # Get architecture-specific flags: the helper's output is split on
  # whitespace into extra arguments for the training command.
  read -r -a ARCH_ARGS <<< "$(
    python src/intervention_sampling/neural_networks/get_architecture_args.py \
      --architecture "$ARCH" \
      --parameter-budget 128000 \
      --vocabulary-file "${test_dir}/main.vocab"
  )"

  # Fixed hyperparameters
  max_tokens=256  # A reasonable middle value
  lr=0.01         # A typical learning rate

  # Set output directory with complete path structure
  if [[ $TYPE == "vanilla" ]]; then
    rel_path="vanilla/${SEED}/${TOP_SEED}"
  else
    rel_path="${SEMIRING}/${SEED}/${TOP_SEED}/${INTERVENTION}/${TARGET}/${IC}"
  fi

  output_dir="${MODELS_DIR}/${ARCH}/${rel_path}/${MSEED}"
  mkdir -p "${output_dir}"

  # Run the prepare data script
  echo "🔧 Running data preparation..."
  python src/rau/tasks/language_modeling/prepare_data.py \
        --more-data-files "$train_dir"/main.{tok,prepared} \
        --more-data-files "$val_dir"/main.{tok,prepared} \
        --training-data "$test_dir" \
        --vocabulary-file "${test_dir}/main.vocab" \
        --never-allow-unk

  # ─── TRAIN ────────────────────────────────────────────────────────────
  if [[ -f "${output_dir}/model.pt" ]]; then
    echo "💾 Model already exists at ${output_dir}, skipping training."
  else
    echo "🏋️ Training model..."

    python src/rau/tasks/language_modeling/train_kl.py \
      --training-data-file   "${train_dir}/main.prepared" \
      --validation-data-file "${val_dir}/main.prepared" \
      --vocabulary-file      "${test_dir}/main.vocab" \
      --output               "${output_dir}" \
      --architecture         "${ARCH}" \
      "${ARCH_ARGS[@]}" \
      --init-scale 0.1 \
      --max-epochs 1000 \
      --max-tokens-per-batch "${max_tokens}" \
      --optimizer Adam \
      --initial-learning-rate "${lr}" \
      --gradient-clipping-threshold 5 \
      --early-stopping-patience 100 \
      --learning-rate-patience 5 \
      --learning-rate-decay-factor 0.5 \
      --examples-per-checkpoint 1000 \
      --no-progress \
      --automaton "${train_dir}/machine.pkl" \
      --device cpu
  fi

  # ─── EVAL ─────────────────────────────────────────────────────────────
  eval_dir="${output_dir}/eval"
  eval_output="${eval_dir}/token-negative-log-probabilities.pt"

  if [[ -f "${eval_output}" ]]; then
    echo "💾 Evaluation output already exists at ${eval_output}, skipping evaluation."
  else
    echo "🔍 Running evaluation..."
    mkdir -p "${eval_dir}"

    python src/intervention_sampling/neural_networks/evaluate.py \
      --batching-max-tokens 1024 \
      --load-model "${output_dir}" \
      --input-file "${test_dir}/main.prepared" \
      --output    "${eval_dir}"
  fi

  # ─── KL DECOMP ────────────────────────────────────────────────────────
  kl_output="${eval_dir}/kl_results.json"

  if [[ -f "${kl_output}" ]]; then
    echo "💾 KL decomposition output already exists at ${kl_output}, skipping."
  else
    echo "📊 Running KL decomposition..."

    kl_args=(
      --model_logprobs "${eval_output}"
      --automaton "${train_dir}/machine.pkl"
      --arcs "${test_dir}/arcs.txt"
      --vocab_file "${test_dir}/main.vocab"
    )

    # Pass --output_file only when the script's source mentions that option.
    # Errors (e.g. unreadable script) are deliberately treated as "not
    # supported".
    if grep -qF -- '--output_file' src/intervention_sampling/evaluate_kl.py 2>/dev/null; then
      kl_args+=(--output_file "${kl_output}")
    fi

    python src/intervention_sampling/evaluate_kl.py "${kl_args[@]}"
  fi

  echo "✅ Completed [$TYPE] arch=$ARCH | seed=$SEED | top=$TOP_SEED | target=${TARGET:-n/a} | ic=$IC | model_seed=$MSEED"
}

# Export the helper functions so bash processes started by this script
# (the job launcher below) can call them by name.
export -f random_sample
export -f run_single_job

#### BUILD & SHUFFLE JOB LIST ####
JOB_LIST="job_list.txt"

# Number of topology seeds (1..N) covered by each job family.
# NOTE(review): the log message used to say "100 topologies" while the loop
# covered 400; the loop count is kept and the message now reports it.
# Confirm that 400 is the intended number.
VANILLA_TOPOLOGIES=400
INTERVENTION_TOPOLOGIES=10

#######################################
# Print the vanilla jobs, one per line, for topology seeds 1..$1.
# Line format: TYPE SEMIRING SEED INTERVENTION TARGET IC ARCH MSEED TOP_SEED
# Globals:   MAIN_SEED (read)
# Arguments: $1 - number of topology seeds
# Outputs:   job lines on stdout
#######################################
emit_vanilla_jobs() {
  local topologies="$1"
  local top arch mseed
  for ((top = 1; top <= topologies; top++)); do
    for arch in lstm transformer; do
      for mseed in {1..5}; do
        # The IC column is unused for vanilla jobs; 0 keeps the format uniform.
        printf 'vanilla none %s none none 0 %s %s %s\n' \
          "${MAIN_SEED}" "${arch}" "${mseed}" "${top}"
      done
    done
  done
}

#######################################
# Print the intervention jobs, one per line, for topology seeds 1..$1.
# Line format: TYPE SEMIRING SEED INTERVENTION TARGET IC ARCH MSEED TOP_SEED
# Globals:   MAIN_SEED, INTERVENTION_START, INTERVENTION_STEP,
#            INTERVENTION_END (read)
# Arguments: $1 - number of topology seeds
# Outputs:   job lines on stdout
# Returns:   non-zero if the intervention-count range cannot be generated
#######################################
emit_intervention_jobs() {
  local topologies="$1"
  local semiring top intervention target ic arch mseed
  local ic_values

  # Compute the intervention counts once instead of once per target.
  ic_values=$(seq "${INTERVENTION_START}" "${INTERVENTION_STEP}" "${INTERVENTION_END}") \
    || return

  for semiring in alo; do
    for ((top = 1; top <= topologies; top++)); do
      for intervention in state; do
        # Only state interventions produce jobs; their targets are 1-9
        # (target 0 is skipped).
        [[ "${intervention}" == "state" ]] || continue
        for target in {1..9}; do
          # Unquoted on purpose: seq prints one count per line.
          for ic in ${ic_values}; do
            for arch in lstm transformer; do
              for mseed in {1..5}; do
                printf 'intervention %s %s %s %s %s %s %s %s\n' \
                  "${semiring}" "${MAIN_SEED}" "${intervention}" "${target}" \
                  "${ic}" "${arch}" "${mseed}" "${top}"
              done
            done
          done
        done
      done
    done
  done
}

rm -f -- "${JOB_LIST}"

# Print debugging information about the IDENTIFIER at the beginning
echo "===================== DEBUG INFO ====================="
echo "IDENTIFIER: ${IDENTIFIER}"
echo "BASE_DIR: ${BASE_DIR}"
echo "DATA_DIR: ${DATA_DIR}"
echo "======================================================"

# Each family is written with a single redirect rather than one append
# per job line.
echo "Generating vanilla jobs with ${VANILLA_TOPOLOGIES} topologies..."
emit_vanilla_jobs "${VANILLA_TOPOLOGIES}" > "${JOB_LIST}"

echo "Generating intervention jobs with ${INTERVENTION_TOPOLOGIES} topologies..."
emit_intervention_jobs "${INTERVENTION_TOPOLOGIES}" >> "${JOB_LIST}"

echo "🔀 Shuffling jobs..."
sort -R "${JOB_LIST}" -o "${JOB_LIST}"

TOTAL=$(wc -l < "${JOB_LIST}")
echo "📋 Total jobs to run: ${TOTAL}"

#### LAUNCH ####
echo "🚀 Launching up to ${MAX_JOBS} concurrent jobs..."

# Every line of the job list is split on single spaces into the nine
# positional arguments of run_single_job; the job log goes to
# parallel_job.log.
parallel_opts=(
  --progress
  --jobs "${MAX_JOBS}"
  --colsep ' '
  --joblog parallel_job.log
  --will-cite
)
parallel "${parallel_opts[@]}" \
  run_single_job {1} {2} {3} {4} {5} {6} {7} {8} {9} :::: "${JOB_LIST}"

echo "🎉 All training jobs done."