#!/bin/bash

# SAT Feature Extraction One-Click Script
# Extract compact features from all specified datasets and perform global normalization

set -e  # Abort the run as soon as an unhandled command fails

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Dataset folder names; each one is looked up under dataset/ and handed to the
# extraction and normalization scripts.
DATASETS=(
    "train"
    "CoinsGrid_cnf"
    "CNP"
    "KnightTour"
    "LangFord_cnf"
    "PRP_cnf"
    "SCPC"
    "Zamkeller_cnf"
)
# Value forwarded to the extractor as --max-workers.
MAX_WORKERS=4
# Directories this script inspects (and wipes before a run) for raw and
# normalized feature files.
FEATURES_DIR="features"
NORMALIZED_DIR="normalized_features"
# One log file per run, named after the start timestamp.
LOG_FILE="feature_extraction_$(date +%Y%m%d_%H%M%S).log"

# ---------------------------------------------------------------------------
# ANSI color escape sequences (kept as literal backslash text; the logging
# helpers expand them when printing)
# ---------------------------------------------------------------------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset / no color

# Logging functions
#
# log MESSAGE
#   Print MESSAGE prefixed with a blue "[YYYY-MM-DD HH:MM:SS]" timestamp and
#   append the identical line (color codes included) to $LOG_FILE.
#   Backslash escapes in the line are expanded.
log() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    printf '%b\n' "${BLUE}[${stamp}]${NC} $1" | tee -a "$LOG_FILE"
}

# log_success MESSAGE
#   Same as a plain log line, but the timestamp is green and followed by a
#   check mark. The line goes to stdout and is appended to $LOG_FILE.
log_success() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    printf '%b\n' "${GREEN}[${stamp}] ✓${NC} $1" | tee -a "$LOG_FILE"
}

# log_error MESSAGE
#   Emit MESSAGE with a red timestamp and a cross mark. Like the other
#   helpers it writes to stdout (not stderr) and appends to $LOG_FILE.
log_error() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    printf '%b\n' "${RED}[${stamp}] ✗${NC} $1" | tee -a "$LOG_FILE"
}

# log_warning MESSAGE
#   Emit MESSAGE with a yellow timestamp and a warning sign, on stdout and
#   appended to $LOG_FILE.
log_warning() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    printf '%b\n' "${YELLOW}[${stamp}] ⚠${NC} $1" | tee -a "$LOG_FILE"
}

# Check dependencies
#
# Verifies, in order, that a `python` command is resolvable, that both helper
# scripts exist in the current directory, and that a dataset/ directory is
# present. The first missing item is logged and the script exits with
# status 1; otherwise a success line is logged.
check_dependencies() {
    log "Checking dependencies and environment..."

    if ! command -v python > /dev/null 2>&1; then
        log_error "Python not found"
        exit 1
    fi

    # Both helper scripts are looked up relative to the working directory.
    local script
    for script in extract_compact_features.py normalize_features.py; do
        if [ ! -f "$script" ]; then
            log_error "$script not found"
            exit 1
        fi
    done

    if [ ! -d "dataset" ]; then
        log_error "dataset directory not found"
        exit 1
    fi

    log_success "Dependencies check passed"
}

# Check datasets
#
# For every name in DATASETS, report whether dataset/<name> exists and, if it
# does, how many regular files matching *.cnf it contains (searched
# recursively). A missing dataset only produces a warning; this function never
# aborts the run.
check_datasets() {
    log "Checking datasets..."

    # Keep loop state local so it cannot clobber same-named variables elsewhere.
    local dataset file_count
    for dataset in "${DATASETS[@]}"; do
        if [ -d "dataset/$dataset" ]; then
            # -type f: count files only, not directories that happen to be
            # named *.cnf. The $(( )) wrapper strips the leading padding that
            # some wc implementations (e.g. BSD/macOS) put before the number.
            file_count=$(( $(find "dataset/$dataset" -type f -name "*.cnf" | wc -l) ))
            log_success "Found dataset: $dataset ($file_count CNF files)"
        else
            log_warning "Dataset not found: $dataset"
        fi
    done
}

# Clean up old files
#
# Recursively deletes the raw-feature directory ($FEATURES_DIR) and the
# normalized-feature directory ($NORMALIZED_DIR) when they exist, logging one
# success line per directory actually removed. Absent directories are skipped
# silently.
cleanup_old_files() {
    log "Cleaning up old feature files..."

    # Parallel tables: directory to remove and the message logged afterwards.
    local -a targets=("$FEATURES_DIR" "$NORMALIZED_DIR")
    local -a notices=(
        "Removed old features directory: $FEATURES_DIR"
        "Removed old normalized directory: $NORMALIZED_DIR"
    )

    local i
    for i in 0 1; do
        [ -d "${targets[i]}" ] || continue
        rm -rf "${targets[i]}"
        log_success "${notices[i]}"
    done
}

# Extract raw features
#
# Runs extract_compact_features.py once for all names in DATASETS, passing
# MAX_WORKERS as --max-workers and requesting npz output. The script's
# stdout and stderr are shown on the terminal and appended to $LOG_FILE.
# Logs success when the Python process exits 0; otherwise logs an error and
# exits the whole script with status 1.
extract_features() {
    log "Starting feature extraction for all datasets (21 independent features)..."
    log "Datasets: ${DATASETS[*]}"
    log "Parallel workers: $MAX_WORKERS"

    # Execute feature extraction (only process specified datasets).
    # - The array is expanded quoted so each dataset name reaches Python as
    #   exactly one argument, with no word-splitting or globbing.
    # - pipefail is enabled inside a subshell so that the pipeline's status
    #   reflects a failing python process; without it the `if` would only
    #   see tee's exit status and a failed extraction would be reported as
    #   success. The subshell keeps the option from leaking to other code.
    if (
        set -o pipefail
        python extract_compact_features.py \
            --folders "${DATASETS[@]}" \
            --max-workers "$MAX_WORKERS" \
            --formats npz 2>&1 | tee -a "$LOG_FILE"
    ); then
        log_success "Raw feature extraction completed"
    else
        log_error "Feature extraction failed"
        exit 1
    fi
}

# Global normalization
#
# Collects every name in DATASETS for which
# $FEATURES_DIR/<name>/<name>_features.npz exists, then runs
# normalize_features.py once over that subset with the "standard" method and
# the run name "all_datasets_standard". Script output is shown and appended
# to $LOG_FILE. Exits the whole script with status 1 when no feature file is
# available or when the Python process fails.
normalize_features() {
    log "Starting global feature normalization..."

    # Check which datasets successfully extracted features.
    # Working variables are local so they do not leak into global scope.
    local -a available_datasets=()
    local dataset
    for dataset in "${DATASETS[@]}"; do
        if [ -f "$FEATURES_DIR/$dataset/${dataset}_features.npz" ]; then
            available_datasets+=("$dataset")
            log_success "Confirmed dataset: $dataset"
        else
            log_warning "Skipping dataset (feature file not found): $dataset"
        fi
    done

    if [ "${#available_datasets[@]}" -eq 0 ]; then
        log_error "No available feature files for normalization"
        exit 1
    fi

    log "Will normalize ${#available_datasets[@]} datasets: ${available_datasets[*]}"

    # Execute normalization.
    # - The array is expanded quoted so each dataset name is passed as exactly
    #   one argument.
    # - pipefail (scoped to a subshell) makes the pipeline status reflect a
    #   failing python process; otherwise only tee's exit status would be
    #   tested and a failed normalization would be logged as completed.
    if (
        set -o pipefail
        python normalize_features.py \
            --datasets "${available_datasets[@]}" \
            --method standard \
            --name all_datasets_standard \
            --show-stats 2>&1 | tee -a "$LOG_FILE"
    ); then
        log_success "Feature normalization completed"
    else
        log_error "Feature normalization failed"
        exit 1
    fi
}

# Generate summary report
#
# Prints a plain-text report to stdout and appends the same text to
# $LOG_FILE. The report lists:
#   - the size of each $FEATURES_DIR/<name>/<name>_features.npz that exists,
#   - the size of each $NORMALIZED_DIR/*_normalized_*.npz file, plus a line
#     for the normalizer pickle when present,
#   - a fixed "Usage" section and the log file path.
# Sections for directories that do not exist are omitted.
generate_summary() {
    log "Generating summary report..."

    local entry name archive
    # The whole report is produced by one group and tee'd once, so every line
    # reaches both the terminal and the log in order.
    {
        printf '%s\n' \
            "" \
            "======================================" \
            "      Feature Extraction Report" \
            "======================================"

        # Raw feature statistics
        if [ -d "$FEATURES_DIR" ]; then
            printf '%s\n' "Raw feature files:"
            for entry in "$FEATURES_DIR"/*; do
                [ -d "$entry" ] || continue
                name=$(basename "$entry")
                archive="$entry/${name}_features.npz"
                [ -f "$archive" ] || continue
                printf '%s\n' "  ✓ $name: $(du -h "$archive" | cut -f1)"
            done
        fi

        # Normalized feature statistics
        if [ -d "$NORMALIZED_DIR" ]; then
            printf '%s\n' "" "Normalized feature files:"
            for archive in "$NORMALIZED_DIR"/*_normalized_*.npz; do
                # An unmatched glob stays literal; the -f test filters it out.
                [ -f "$archive" ] || continue
                printf '%s\n' "  ✓ $(basename "$archive"): $(du -h "$archive" | cut -f1)"
            done

            # Normalizer file
            if [ -f "$NORMALIZED_DIR/normalizer_all_datasets_standard.pkl" ]; then
                printf '%s\n' "  ✓ Normalizer: normalizer_all_datasets_standard.pkl"
            fi
        fi

        printf '%s\n' \
            "" \
            "Usage:" \
            "  Raw features: features/<dataset>/<dataset>_features.npz" \
            "  Normalized features: normalized_features/<dataset>_normalized_all_datasets_standard.npz" \
            "  Combined dataset: normalized_features/combined_normalized_all_datasets_standard.npz" \
            "" \
            "Log file: $LOG_FILE"
    } | tee -a "$LOG_FILE"
}

# Main execution flow
#
# Prints the start banner, runs the pipeline steps in a fixed order
# (dependency check, dataset check, cleanup, extraction, normalization,
# summary), then logs the elapsed wall-clock seconds and prints the closing
# banner. Positional arguments are accepted but not used.
main() {
    printf '%s\n' \
        "=======================================" \
        "  SAT Feature Extraction & Normalization" \
        "=======================================" \
        "Start time: $(date)" \
        "Log file: $LOG_FILE" \
        ""

    local started finished step
    started=$(date +%s)

    # Execute steps; with `set -e` active, a failing step ends the run here.
    for step in \
        check_dependencies \
        check_datasets \
        cleanup_old_files \
        extract_features \
        normalize_features \
        generate_summary; do
        "$step"
    done

    finished=$(date +%s)
    log_success "All steps completed! Total time: $((finished - started))s"

    printf '%s\n' \
        "" \
        "=======================================" \
        "            Execution Complete!" \
        "======================================="
}

# Signal handling: on SIGINT or SIGTERM, record the interruption through the
# error logger and terminate with status 1.
handle_interrupt() {
    log_error "Script interrupted"
    exit 1
}
trap handle_interrupt INT TERM

# Execute main flow, forwarding the script's own arguments.
main "$@"