#!/bin/bash

# Source directory that is searched for Generated_data_* sample folders.
BASE_DATA_DIR="/code/jiateng-sandbox/Inject_Complex_Policies/Data_Synthesizer/Sample_Data/"

# Destination directory (LLaMA-Factory data folder holding dataset_info.json).
TARGET_DATA_DIR="/code/jiateng-sandbox/intern_project/third_party/LLaMA-Factory/data"

# Print the start-up banner; each argument becomes one output line.
printf '%s\n' \
    "=== New Three-Dataset Generation Script ===" \
    "This script will process all Generated_data directories and create THREE separate datasets:" \
    "  1. Paraphrasing CPT data (Policy QAs + duplicates)" \
    "  2. Scenario-specific CPT data (Task QAs + Profile comparisons)" \
    "  3. Scenario-specific SFT data (Task QAs + Profile comparisons in conversation format)" \
    ""

# Make sure the destination exists before anything is written to it.
mkdir -p "$TARGET_DATA_DIR"

# Clean the data directory first: drop every dataset_info.json entry whose key
# contains '_new_data' and delete the matching <key>.json file, leaving all
# other registered datasets untouched.
echo "Cleaning existing new_data files..."
if [ -f "$TARGET_DATA_DIR/dataset_info.json" ]; then
    # The target directory is handed to Python as an argument instead of being
    # spliced into the program text, so quotes or backslashes in the path can
    # neither break nor alter the Python code. The quoted heredoc delimiter
    # disables all shell expansion inside the Python source.
    if ! python3 - "$TARGET_DATA_DIR" <<'PY'
import json
import os
import sys

target_dir = sys.argv[1]
dataset_info_path = os.path.join(target_dir, 'dataset_info.json')
if os.path.exists(dataset_info_path):
    with open(dataset_info_path, 'r') as f:
        data = json.load(f)

    # Remove entries that contain '_new_data'
    keys_to_remove = [k for k in data.keys() if '_new_data' in k]
    for key in keys_to_remove:
        print(f'Removing dataset entry: {key}')
        del data[key]
        # Also remove the corresponding JSON file
        json_file = os.path.join(target_dir, f'{key}.json')
        if os.path.exists(json_file):
            os.remove(json_file)
            print(f'Removed file: {json_file}')

    # Save cleaned dataset_info
    with open(dataset_info_path, 'w') as f:
        json.dump(data, f, indent=2)
    print('Cleaned dataset_info.json')
else:
    print('No existing dataset_info.json found, starting fresh')
PY
    then
        # Cleaning stays best-effort (the script continues), but the failure is
        # now reported instead of passing silently.
        echo "Warning: failed to clean $TARGET_DATA_DIR/dataset_info.json" >&2
    fi
else
    echo "No existing dataset_info.json found, starting fresh"
fi

echo ""

#######################################
# Run generate_new.py on a single data directory.
# Arguments: $1 - path of the directory to process
# Outputs:   progress lines on stdout, error lines on stderr
# Returns:   0 if the directory exists and generate_new.py exits 0,
#            1 otherwise
#######################################
process_directory() {
    local dir_path="$1"
    # Declared separately so that `local` does not mask basename's exit status.
    local dir_name
    dir_name=$(basename -- "$dir_path")

    echo "Processing directory: $dir_name"
    echo "Path: $dir_path"

    # Refuse to continue when the directory is missing.
    if [ ! -d "$dir_path" ]; then
        echo "Error: Directory '$dir_path' does not exist" >&2
        return 1
    fi

    # generate_new.py is looked up relative to the current working directory,
    # so the script has to be started from the directory that contains it.
    if ! python generate_new.py "$dir_path"; then
        echo "✗ Failed to process $dir_name" >&2
        return 1
    fi

    echo "✓ Successfully processed $dir_name"
    echo ""
}

# Collect the directories to process: immediate sub-directories of
# BASE_DATA_DIR whose name starts with
# "Generated_data_layer_3_task_5_structure_3" (note: narrower than every
# "Generated_data*" directory), in sorted order.
echo "Finding all Generated_data directories..."
GENERATED_DIRS=()
# Read one path per line instead of relying on word splitting of $(find ...),
# so paths containing spaces or glob characters stay intact.
while IFS= read -r found_dir; do
    GENERATED_DIRS+=("$found_dir")
done < <(find "$BASE_DATA_DIR" -maxdepth 1 -type d -name "Generated_data_layer_3_task_5_structure_3*" | sort)

# Nothing to do without input directories.
if [ ${#GENERATED_DIRS[@]} -eq 0 ]; then
    echo "Error: No Generated_data directories found in $BASE_DATA_DIR" >&2
    exit 1
fi

echo "Found ${#GENERATED_DIRS[@]} Generated_data directories:"
for dir in "${GENERATED_DIRS[@]}"; do
    echo "  - $(basename "$dir")"
done
echo ""

# Run the generator over every discovered directory. For each success the
# three expected dataset names are recorded; for each failure the directory
# name is recorded instead.
printf '%s\n' \
    "Starting dataset generation for all Generated_data directories..." \
    "Each directory will generate 3 datasets: paraphrasing (CPT), scenario_specific (CPT), scenario_specific_sft (SFT)..." \
    ""

PROCESSED_DATASETS=()
FAILED_DIRS=()

for gen_dir in "${GENERATED_DIRS[@]}"; do
    if ! process_directory "$gen_dir"; then
        FAILED_DIRS+=("$(basename "$gen_dir")")
        continue
    fi

    # Dataset names are "<directory name>_new_data_<kind>", one per kind.
    dataset_prefix="$(basename "$gen_dir")_new_data"
    for kind in paraphrasing scenario_specific scenario_specific_sft; do
        PROCESSED_DATASETS+=("${dataset_prefix}_${kind}")
    done
done

echo "=== New Dataset Generation Complete ==="

# List the directories that could not be processed, if any.
if (( ${#FAILED_DIRS[@]} > 0 )); then
    echo "⚠️  Some directories failed to process:"
    printf '  - %s\n' "${FAILED_DIRS[@]}"
    echo ""
fi

# Without a single generated dataset the run counts as failed.
if (( ${#PROCESSED_DATASETS[@]} == 0 )); then
    echo "❌ No datasets were successfully generated."
    exit 1
fi

# Three dataset names are recorded per successful directory.
PROCESSED_DIRS=$(( ${#PROCESSED_DATASETS[@]} / 3 ))
printf '%s\n' \
    "✓ Successfully processed $PROCESSED_DIRS directories!" \
    "✓ Generated ${#PROCESSED_DATASETS[@]} total datasets (3 per directory)!" \
    "" \
    "Generated datasets:"

# Numbered listing; the annotation is chosen by the dataset-name suffix.
# The "_sft" pattern is tested before its "_scenario_specific" prefix form.
for idx in "${!PROCESSED_DATASETS[@]}"; do
    dataset_name="${PROCESSED_DATASETS[$idx]}"
    case "$dataset_name" in
        *_paraphrasing)
            echo "$((idx+1)). $dataset_name (CPT - Policy QAs + duplicates)"
            ;;
        *_scenario_specific_sft)
            echo "$((idx+1)). $dataset_name (SFT - Task QAs + Profile comparisons)"
            ;;
        *_scenario_specific)
            echo "$((idx+1)). $dataset_name (CPT - Task QAs + Profile comparisons)"
            ;;
    esac
done

printf '%s\n' \
    "" \
    "Data location: $TARGET_DATA_DIR" \
    "Dataset info: $TARGET_DATA_DIR/dataset_info.json" \
    "" \
    "Dataset types:" \
    "  - *_paraphrasing: CPT training on policy understanding" \
    "  - *_scenario_specific: CPT training on task execution" \
    "  - *_scenario_specific_sft: SFT training on task execution" \
    "" \
    "You can now use these datasets for training with LLaMA-Factory!"