#!/bin/bash

# Source directory that is searched for Generated_data_* directories
BASE_DATA_DIR='/code/jiateng-sandbox/Inject_Complex_Policies/Data_Synthesizer/Sample_Data/'

# Directory reported as the data location (holds dataset_info.json)
TARGET_DATA_DIR='/code/jiateng-sandbox/intern_project/third_party/LLaMA-Factory/data'

# Print the start-up banner in one go; the quoted delimiter keeps the text
# literal, and the trailing empty line matches the blank separator line.
cat <<'BANNER'
=== CPT Data Generation Script ===
This script will process all Generated_data directories and create CPT datasets.
For each directory, two dataset variants will be generated:
  - '_less' variant: 5K task samples, 1K layer samples per layer
  - '_more' variant: 8K task samples, 2K layer samples per layer

BANNER

# Create target directory if it doesn't exist. Abort right away when that is
# not possible: every later step that refers to TARGET_DATA_DIR would
# otherwise run against a missing directory.
if ! mkdir -p -- "$TARGET_DATA_DIR"; then
    echo "Error: Could not create target directory '$TARGET_DATA_DIR'" >&2
    exit 1
fi

# Remove previously generated CPT datasets from a data directory.
#
# Arguments:
#   $1 - directory containing dataset_info.json
# Behavior:
#   Every key in dataset_info.json ending in '_new_cpt_data',
#   '_new_cpt_data_less' or '_new_cpt_data_more' is deleted, together with the
#   matching '<key>.json' file in the same directory. All other entries are
#   kept. When dataset_info.json does not exist, nothing is changed.
# Returns:
#   0 on success or when there is nothing to clean; otherwise the exit status
#   of the Python helper (e.g. unreadable or malformed JSON).
clean_cpt_dataset_entries() {
    local data_dir="$1"

    if [ ! -f "$data_dir/dataset_info.json" ]; then
        echo "No existing dataset_info.json found, starting fresh"
        return 0
    fi

    # The directory is handed over as an argument and the here-doc delimiter
    # is quoted, so the path is never interpolated into Python source code
    # (a quote or backslash in the path cannot break or alter the program).
    python3 - "$data_dir" <<'PY'
import json
import os
import sys

data_dir = sys.argv[1]
dataset_info_path = os.path.join(data_dir, 'dataset_info.json')
cpt_suffixes = ('_new_cpt_data', '_new_cpt_data_less', '_new_cpt_data_more')

with open(dataset_info_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect the keys first so the dict is not modified while iterating over it.
keys_to_remove = [k for k in data if k.endswith(cpt_suffixes)]
for key in keys_to_remove:
    print(f'Removing dataset entry: {key}')
    del data[key]
    # Also remove the corresponding JSON file
    json_file = os.path.join(data_dir, f'{key}.json')
    if os.path.exists(json_file):
        os.remove(json_file)
        print(f'Removed file: {json_file}')

# Save cleaned dataset_info
with open(dataset_info_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)
print('Cleaned dataset_info.json')
PY
}

# Clean the data directory first (remove any existing CPT data files)
echo "Cleaning existing CPT data files..."
if ! clean_cpt_dataset_entries "$TARGET_DATA_DIR"; then
    # Best effort: report the problem but let the run continue.
    echo "Warning: failed to clean existing CPT entries in $TARGET_DATA_DIR/dataset_info.json" >&2
fi

echo ""

# Run generate.py on a single directory and report the outcome.
#
# Arguments:
#   $1 - path of the directory to process
# Outputs:
#   Progress and result messages on stdout.
# Returns:
#   0 when generate.py exits successfully; 1 when the directory does not
#   exist or generate.py exits with a non-zero status.
# Notes:
#   generate.py is resolved relative to the current working directory.
process_directory() {
    local dir_path="$1"
    # Declared separately so the assignment does not mask basename's status.
    local dir_name
    dir_name=$(basename -- "$dir_path")

    echo "Processing directory: $dir_name"
    echo "Path: $dir_path"

    # Check if directory exists
    if [ ! -d "$dir_path" ]; then
        echo "Error: Directory '$dir_path' does not exist"
        return 1
    fi

    # Keep using `python` where it exists; fall back to `python3` on hosts
    # that ship no unversioned `python` command.
    local interpreter="python"
    if ! command -v python >/dev/null 2>&1; then
        interpreter="python3"
    fi

    # Call the Python script
    if "$interpreter" generate.py "$dir_path"; then
        echo "✓ Successfully processed $dir_name"
    else
        echo "✗ Failed to process $dir_name"
        return 1
    fi

    echo ""
}

# Find the directories to process: immediate subdirectories of BASE_DATA_DIR
# whose name matches GENERATED_DIR_PATTERN, in sorted order. Note that the
# pattern is narrower than "everything starting with Generated_data".
echo "Finding all Generated_data directories..."
GENERATED_DIR_PATTERN="Generated_data_layer_3_task_5_structure_3*"
GENERATED_DIRS=()
# Read one path per line instead of word-splitting the command output, so
# paths containing spaces or glob characters stay intact.
while IFS= read -r found_dir; do
    GENERATED_DIRS+=("$found_dir")
done < <(find "$BASE_DATA_DIR" -maxdepth 1 -type d -name "$GENERATED_DIR_PATTERN" | sort)

if [ ${#GENERATED_DIRS[@]} -eq 0 ]; then
    echo "Error: No Generated_data directories found in $BASE_DATA_DIR"
    exit 1
fi

echo "Found ${#GENERATED_DIRS[@]} Generated_data directories:"
for dir in "${GENERATED_DIRS[@]}"; do
    echo "  - $(basename "$dir")"
done
echo ""

# Run process_directory on every discovered directory and keep score:
# PROCESSED_DATASETS receives two dataset names per success, FAILED_DIRS the
# base name of each directory that failed.
printf '%s\n' \
    "Starting CPT data generation for all Generated_data directories..." \
    "Each directory will generate 2 dataset variants (_less and _more)..." \
    ""

PROCESSED_DATASETS=()
FAILED_DIRS=()

for dir in "${GENERATED_DIRS[@]}"; do
    dataset_prefix=$(basename "$dir")

    # Failure first: record the directory and move on to the next one.
    if ! process_directory "$dir"; then
        FAILED_DIRS+=("$dataset_prefix")
        continue
    fi

    # A successful directory yields both variants: _less and _more.
    PROCESSED_DATASETS+=(
        "${dataset_prefix}_new_cpt_data_less"
        "${dataset_prefix}_new_cpt_data_more"
    )
done

# Final report. Failed directories are listed first; then either the generated
# datasets are listed, or the script exits with status 1 if there are none.
# The script still finishes with status 0 when only some directories failed.
printf '%s\n' "=== CPT Data Generation Complete ==="

if (( ${#FAILED_DIRS[@]} > 0 )); then
    printf '%s\n' "⚠️  Some directories failed to process:"
    printf '  - %s\n' "${FAILED_DIRS[@]}"
    printf '\n'
fi

# Nothing was generated at all: report it and stop with a failing status.
if (( ${#PROCESSED_DATASETS[@]} == 0 )); then
    printf '%s\n' "❌ No datasets were successfully generated."
    exit 1
fi

# Two datasets are recorded per directory, so halve the count.
PROCESSED_DIRS=$(( ${#PROCESSED_DATASETS[@]} / 2 ))
printf '%s\n' \
    "✓ Successfully processed $PROCESSED_DIRS directories!" \
    "✓ Generated ${#PROCESSED_DATASETS[@]} total datasets (2 variants per directory)!" \
    "" \
    "Generated datasets:"

# Numbered list; the sample sizes shown depend on the variant suffix.
position=0
for dataset_name in "${PROCESSED_DATASETS[@]}"; do
    position=$(( position + 1 ))
    case "$dataset_name" in
        *_less)
            printf '%s\n' "$position. $dataset_name (5K task samples, 1K layer samples)"
            ;;
        *)
            printf '%s\n' "$position. $dataset_name (8K task samples, 2K layer samples)"
            ;;
    esac
done

printf '%s\n' \
    "" \
    "Data location: $TARGET_DATA_DIR" \
    "Dataset info: $TARGET_DATA_DIR/dataset_info.json" \
    "" \
    "You can now use these datasets for CPT training with LLaMA-Factory!" \
    "Choose '_less' variants for lighter training or '_more' variants for intensive training."
