#!/bin/bash

# Script to run all experiments sequentially
# This script will iterate through all config files in recipe/experiments/ and run training

set -e  # Exit on any error

# Configuration
#   EXPERIMENT_DIR  directory searched (recursively) for *.json experiment configs
#   SCRIPT_PATH     training entry point handed to torchrun
#   CUDA_DEVICES    GPU ids exported as CUDA_VISIBLE_DEVICES for every run
#   NPROC_PER_NODE  value passed to torchrun --nproc_per_node
#                   (presumably should equal the number of ids in CUDA_DEVICES)
EXPERIMENT_DIR="recipe/experiments"
SCRIPT_PATH="script/train/SFT_train.py"
CUDA_DEVICES="3,4,6,7"
NPROC_PER_NODE=4

# Colors for output (ANSI escape sequences, interpreted by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

echo -e "${BLUE}Starting experiment batch run...${NC}"
echo -e "${BLUE}Experiment directory: ${EXPERIMENT_DIR}${NC}"
echo -e "${BLUE}CUDA devices: ${CUDA_DEVICES}${NC}"
echo -e "${BLUE}Processes per node: ${NPROC_PER_NODE}${NC}"
echo ""

# Check if experiment directory exists
if [ ! -d "$EXPERIMENT_DIR" ]; then
    echo -e "${RED}Error: Experiment directory $EXPERIMENT_DIR does not exist!${NC}"
    echo -e "${YELLOW}Please run 'python script/generate_configs.py' first to generate configurations.${NC}"
    exit 1
fi

# Get all config files, sorted by path.
# Paths are read one line at a time so that a path containing spaces or glob
# characters stays intact as a single array element instead of being
# word-split. Only *.json files are matched, so non-JSON artifacts such as
# experiment_summary.txt are never picked up.
CONFIG_FILES=()
while IFS= read -r config_path; do
    CONFIG_FILES+=("$config_path")
done < <(find "$EXPERIMENT_DIR" -name "*.json" | sort)

if [ "${#CONFIG_FILES[@]}" -eq 0 ]; then
    echo -e "${RED}Error: No config files found in $EXPERIMENT_DIR${NC}"
    echo -e "${YELLOW}Please run 'python script/generate_configs.py' first to generate configurations.${NC}"
    exit 1
fi

echo -e "${GREEN}Found ${#CONFIG_FILES[@]} configuration files to run:${NC}"
for config in "${CONFIG_FILES[@]}"; do
    echo -e "  - $(basename "$config")"
done
echo ""

# Create log directory (one <config name>.log file per experiment is written here)
LOG_DIR="logs/experiments"
mkdir -p "$LOG_DIR"

#######################################
# Run one training experiment through torchrun and report the outcome.
# Globals:
#   LOG_DIR, CUDA_DEVICES, NPROC_PER_NODE, SCRIPT_PATH (read)
#   BLUE, GREEN, RED, NC (read; output colors)
#   CUDA_VISIBLE_DEVICES (exported, set to $CUDA_DEVICES)
# Arguments:
#   $1 - path to the experiment's JSON config file
# Outputs:
#   Timestamped status lines on stdout. torchrun's stdout and stderr go to
#   $LOG_DIR/<basename of $1 without .json>.log, which is truncated each run.
# Returns:
#   0 if torchrun exited successfully, 1 otherwise
#######################################
run_experiment() {
    local cfg_path=$1
    local exp_name=$(basename "$cfg_path" .json)
    local out_log="$LOG_DIR/${exp_name}.log"

    echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')] Starting experiment: $exp_name${NC}"
    echo -e "${BLUE}Config: $cfg_path${NC}"
    echo -e "${BLUE}Log file: $out_log${NC}"

    # Restrict the run to the configured GPUs.
    export CUDA_VISIBLE_DEVICES="$CUDA_DEVICES"

    # Launch training; `|| rc=$?` records a failure without aborting the shell.
    local rc=0
    torchrun --nproc_per_node="$NPROC_PER_NODE" "$SCRIPT_PATH" --config "$cfg_path" > "$out_log" 2>&1 || rc=$?

    if [ "$rc" -ne 0 ]; then
        echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] ✗ Failed: $exp_name${NC}"
        echo -e "${RED}Check log file: $out_log${NC}"
        return 1
    fi

    echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')] ✓ Completed: $exp_name${NC}"
    return 0
}

# Main execution loop
# Runs every config in CONFIG_FILES one after another, tallies successes and
# failures, prints a summary, and exits non-zero if any experiment failed.
total_experiments=${#CONFIG_FILES[@]}
completed_experiments=0
failed_experiments=0
failed_configs=()

echo -e "${YELLOW}Starting batch execution of $total_experiments experiments...${NC}"
echo ""

for config_file in "${CONFIG_FILES[@]}"; do
    config_name=$(basename "$config_file" .json)

    # Position of the current experiment: everything already finished
    # (successful or failed) plus one, so the counter keeps advancing after
    # a failure.
    finished_experiments=$((completed_experiments + failed_experiments))
    echo -e "${YELLOW}Progress: $((finished_experiments + 1))/$total_experiments${NC}"

    # Counters are bumped with plain assignments on purpose: `((var++))`
    # returns status 1 when var is 0, which would abort the whole batch
    # under `set -e` right after the first experiment.
    if run_experiment "$config_file"; then
        completed_experiments=$((completed_experiments + 1))
    else
        failed_experiments=$((failed_experiments + 1))
        failed_configs+=("$config_name")
    fi

    echo ""

    # Optional: Add delay between experiments to prevent system overload
    # (skipped after the last experiment)
    if [ $((completed_experiments + failed_experiments)) -lt "$total_experiments" ]; then
        echo -e "${BLUE}Waiting 30 seconds before next experiment...${NC}"
        sleep 30
    fi
done

# Final summary
echo -e "${BLUE}============================================${NC}"
echo -e "${BLUE}Batch execution completed!${NC}"
echo -e "${BLUE}============================================${NC}"
echo -e "${GREEN}✓ Completed: $completed_experiments/$total_experiments${NC}"
echo -e "${RED}✗ Failed: $failed_experiments/$total_experiments${NC}"

if [ "$failed_experiments" -gt 0 ]; then
    echo -e "${RED}Failed experiments:${NC}"
    for failed_config in "${failed_configs[@]}"; do
        echo -e "${RED}  - $failed_config${NC}"
    done
fi

echo -e "${BLUE}Logs saved in: $LOG_DIR${NC}"
echo -e "${BLUE}Experiment completed at: $(date)${NC}"

# Exit with appropriate code: 1 if any experiment failed, 0 otherwise
if [ "$failed_experiments" -gt 0 ]; then
    exit 1
else
    exit 0
fi