#!/bin/bash

# Generalizable CPT Data Generation Script
# Usage: ./Generate_CPT.sh --policy_file <policy.md> --dataset_files <file1.json> <file2.json> ... --tools_path <tools_dir> --dataset_name <name>

# Default locations used by the rest of the script.
# SCRIPT_DIR is the absolute directory holding this script; it is derived from
# BASH_SOURCE so it does not depend on the caller's working directory.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
# The Python generator is looked up next to this script.
PYTHON_SCRIPT="${SCRIPT_DIR}/generate_any.py"
# Default output directory; overwritten when --output_dir is given.
OUTPUT_DIR='/code/jiateng-sandbox/intern_project/third_party/LLaMA-Factory/data'

# Function to display usage
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --policy_file PATH     Path to the policy file (e.g., wiki.md) [REQUIRED]"
    echo "  --dataset_files PATH1 PATH2 ...  Paths to dataset JSON files [REQUIRED]"
    echo "  --tools_path PATH      Path to the tools directory [REQUIRED]"
    echo "  --dataset_name NAME    Name for the output dataset [REQUIRED]"
    echo "  --output_dir PATH      Output directory (default: $OUTPUT_DIR)"
    echo "  --help                 Display this help message"
    echo ""
    echo "Example:"
    echo "  $0 --policy_file /path/to/wiki.md \\"
    echo "     --dataset_files /path/to/flights.json /path/to/users.json /path/to/reservations.json \\"
    echo "     --tools_path /path/to/tools \\"
    echo "     --dataset_name airline_demo"
    echo ""
    echo "Example with tau-bench airline environment:"
    echo "  $0 --policy_file /code/jiateng-sandbox/Inject_Complex_Policies/CPT_Creation/tau-bench/tau_bench/envs/airline/wiki.md \\"
    echo "     --dataset_files /code/jiateng-sandbox/Inject_Complex_Policies/CPT_Creation/tau-bench/tau_bench/envs/airline/data/flights.json \\"
    echo "                     /code/jiateng-sandbox/Inject_Complex_Policies/CPT_Creation/tau-bench/tau_bench/envs/airline/data/reservations.json \\"
    echo "                     /code/jiateng-sandbox/Inject_Complex_Policies/CPT_Creation/tau-bench/tau_bench/envs/airline/data/users.json \\"
    echo "     --tools_path /code/jiateng-sandbox/Inject_Complex_Policies/CPT_Creation/tau-bench/tau_bench/envs/airline/tools \\"
    echo "     --dataset_name airline_tau_bench"
    exit 1
}

# Variables filled in by argument parsing (OUTPUT_DIR keeps its default unless
# --output_dir is given).
POLICY_FILE=""
DATASET_FILES=()
TOOLS_PATH=""
DATASET_NAME=""

# Parse the command line into the globals above.
#
# Arguments:
#   The script's command line ("$@").
# Globals written:
#   POLICY_FILE, DATASET_FILES, TOOLS_PATH, DATASET_NAME, OUTPUT_DIR.
# Exits:
#   0 after printing help for --help; otherwise through usage on an unknown
#   option or on a value-taking option whose value is missing.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --policy_file | --tools_path | --dataset_name | --output_dir)
                # `shift 2` fails WITHOUT shifting when the option is the last
                # argument, which would make this loop spin forever, so the
                # value is checked first. A value starting with "--" is taken
                # to be the next option, the same rule --dataset_files uses to
                # end its list.
                if [[ $# -lt 2 || "$2" == --* ]]; then
                    echo "Error: $1 requires a value" >&2
                    usage
                fi
                case "$1" in
                    --policy_file) POLICY_FILE="$2" ;;
                    --tools_path) TOOLS_PATH="$2" ;;
                    --dataset_name) DATASET_NAME="$2" ;;
                    --output_dir) OUTPUT_DIR="$2" ;;
                esac
                shift 2
                ;;
            --dataset_files)
                shift
                # Collect all dataset files until the next option or the end.
                while [[ $# -gt 0 && "$1" != --* ]]; do
                    DATASET_FILES+=("$1")
                    shift
                done
                ;;
            --help)
                # Asking for help is not an error. usage terminates the shell
                # it runs in, so it is run in a subshell and the script then
                # exits 0 regardless of the status usage chose.
                (usage 0)
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                usage
                ;;
        esac
    done
}

parse_args "$@"

# Validate required arguments. Each failed check reports its error on stderr
# and then calls usage, which prints the help text and exits.
if [[ -z "$POLICY_FILE" ]]; then
    echo "Error: --policy_file is required" >&2
    usage
fi

if [[ ${#DATASET_FILES[@]} -eq 0 ]]; then
    echo "Error: --dataset_files is required" >&2
    usage
fi

if [[ -z "$TOOLS_PATH" ]]; then
    echo "Error: --tools_path is required" >&2
    usage
fi

if [[ -z "$DATASET_NAME" ]]; then
    echo "Error: --dataset_name is required" >&2
    usage
fi

# Validate file existence. A missing policy file is fatal.
if [[ ! -f "$POLICY_FILE" ]]; then
    echo "Error: Policy file does not exist: $POLICY_FILE" >&2
    exit 1
fi

# A missing dataset file or tools directory is only reported as a warning;
# the run continues.
for dataset_file in "${DATASET_FILES[@]}"; do
    if [[ ! -f "$dataset_file" ]]; then
        echo "Warning: Dataset file does not exist: $dataset_file" >&2
    fi
done

if [[ ! -d "$TOOLS_PATH" ]]; then
    echo "Warning: Tools directory does not exist: $TOOLS_PATH" >&2
fi

# The Python script itself must exist, otherwise there is nothing to run.
if [[ ! -f "$PYTHON_SCRIPT" ]]; then
    echo "Error: Python script does not exist: $PYTHON_SCRIPT" >&2
    exit 1
fi

# Echo the effective configuration back to the user before any work starts.
# One printf call emits every banner line; dataset files are joined into a
# single space-separated field.
printf '%s\n' \
    "==========================================" \
    "Generalizable CPT Data Generation" \
    "==========================================" \
    "Policy file:    ${POLICY_FILE}" \
    "Dataset files:  ${DATASET_FILES[*]}" \
    "Tools path:     ${TOOLS_PATH}" \
    "Dataset name:   ${DATASET_NAME}" \
    "Output dir:     ${OUTPUT_DIR}" \
    "Python script:  ${PYTHON_SCRIPT}" \
    "=========================================="

# Create the output directory if it doesn't exist, and stop right away when
# that fails instead of starting the generator anyway.
if ! mkdir -p "$OUTPUT_DIR"; then
    echo "Error: Could not create output directory: $OUTPUT_DIR" >&2
    exit 1
fi

# Build the Python command as an argv array. Every value stays exactly one
# argument, so paths or names containing spaces, quotes, "$" or backticks are
# passed through verbatim instead of being re-parsed by the shell (which is
# what building a string and running it through eval would do).
PYTHON_CMD=(
    python3 "$PYTHON_SCRIPT"
    --policy_file "$POLICY_FILE"
    --dataset_files "${DATASET_FILES[@]}"
    --tools_path "$TOOLS_PATH"
    --dataset_name "$DATASET_NAME"
    --output_dir "$OUTPUT_DIR"
)

# Log the command for debugging; %q prints each argument shell-quoted so the
# logged line can be copied and pasted back into a shell.
echo "Executing command:"
printf '%q ' "${PYTHON_CMD[@]}"
printf '\n\n'

# Execute the Python script and report the outcome based on its exit status.
if "${PYTHON_CMD[@]}"; then
    echo ""
    echo "=========================================="
    echo "CPT Data Generation Completed Successfully!"
    echo "=========================================="
    echo "Dataset name: ${DATASET_NAME}_cpt_data"
    echo "Output file: $OUTPUT_DIR/${DATASET_NAME}_cpt_data.json"
    echo "Dataset info: $OUTPUT_DIR/dataset_info.json"
    echo ""
    echo "You can now use this dataset for CPT training!"
else
    {
        echo ""
        echo "=========================================="
        echo "CPT Data Generation Failed!"
        echo "=========================================="
        echo "Please check the error messages above."
    } >&2
    exit 1
fi
