#!/bin/bash

# Batch script to process all parquet files in a directory
# Converts parquet to JSON and generates token statistics for each file

# ===========================================
# EDIT THESE PATHS:
# ===========================================
# Root directory that is searched for *.parquet files
INPUT_DIR='/fsx/training/output/user/evaluation_v3/L1_Mentalese/no_budget/Agentica24k_1.5b_mentalese_cot_lr_1e-6_SFT30k_ins_suffix_ckpt_8910_grpo_with_l1_exact_range_64_512_ray_multinode_1024_response/actor/global_step_1500'

# Python virtual environment to activate before running (point this at your own venv)
VENV_PATH='/fsx/training/source/user/repos/arrakis/.venv'

# ===========================================
# Script execution (don't edit below this line)
# ===========================================

# Announce the run and show which directory is about to be scanned.
printf '%s\n' "🚀 Starting Batch Processing of Parquet Files..."
printf '%s\n' "📁 Input Directory: $INPUT_DIR"
printf '\n'

# Bail out early when the configured input directory is missing.
[[ -d "$INPUT_DIR" ]] || {
    printf '%s\n' "❌ Error: Input directory '$INPUT_DIR' does not exist!"
    exit 1
}

# Absolute directory containing this script; the Python helper scripts are
# looked up relative to it. "--" keeps a path starting with "-" from being
# parsed as an option by cd/dirname.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Activate the virtual environment when one is configured and usable.
# Falls back to the system Python when VENV_PATH is still the placeholder,
# is not a directory, or lacks a bin/activate script (sourcing a missing
# file would only produce a raw shell error after announcing activation).
if [[ "$VENV_PATH" == "path/to/your/venv" || ! -d "$VENV_PATH" ]]; then
    echo "⚠️  No virtual environment specified or path doesn't exist, using system Python"
elif [[ ! -f "$VENV_PATH/bin/activate" ]]; then
    echo "⚠️  Virtual environment '$VENV_PATH' has no bin/activate script, using system Python"
else
    echo "🔧 Activating virtual environment: $VENV_PATH"
    # shellcheck source=/dev/null
    source "$VENV_PATH/bin/activate"
fi

# Find all parquet files under the input directory (find descends into
# subdirectories). Paths are emitted NUL-terminated and read one at a time,
# so names containing spaces, newlines, or glob characters each become
# exactly one array element instead of being word-split or glob-expanded.
echo "🔍 Searching for parquet files in: $INPUT_DIR"
PARQUET_FILES=()
while IFS= read -r -d '' found_file; do
    PARQUET_FILES+=("$found_file")
done < <(find "$INPUT_DIR" -name "*.parquet" -type f -print0)

# Nothing to do: report and stop with a failure status.
if [ ${#PARQUET_FILES[@]} -eq 0 ]; then
    echo "❌ No parquet files found in the specified directory!"
    exit 1
fi

# List what was found (file names only) before processing starts.
echo "📊 Found ${#PARQUET_FILES[@]} parquet file(s):"
for file in "${PARQUET_FILES[@]}"; do
    echo "   - $(basename "$file")"
done
echo ""

# Print the 60-character "=" rule that frames each file's log section.
# (The previous `echo "=" * 60` let the shell glob-expand `*` into the
# names of the files in the current directory.)
print_separator() {
    local rule
    printf -v rule '%60s' ''
    printf '%s\n' "${rule// /=}"
}

# Convert one parquet file to JSON, then run token analysis on that JSON.
# Globals:   SCRIPT_DIR (read) - directory holding the two Python helpers
# Arguments: $1 - path to the parquet file
# Outputs:   progress messages on stdout
# Returns:   0 - conversion and token analysis both succeeded
#            1 - parquet to JSON conversion failed
#            2 - conversion reported success but the JSON file is missing
#            3 - JSON was created but token analysis failed
process_parquet_file() {
    local parquet_file=$1
    # Output JSON path: same directory, .json extension
    local json_file="${parquet_file%.parquet}.json"
    local parquet_name json_name
    parquet_name=$(basename "$parquet_file")
    json_name=$(basename "$json_file")

    print_separator
    echo "🔄 Processing: $parquet_name"
    print_separator

    echo "📥 Converting parquet to JSON..."
    echo "   Input:  $parquet_name"
    echo "   Output: $json_name"

    # Run the parquet to JSON conversion
    if ! python3 "$SCRIPT_DIR/simple_parquet_to_json.py" "$parquet_file" "$json_file"; then
        echo "❌ Parquet to JSON conversion failed for $parquet_name"
        return 1
    fi
    echo "✅ Parquet to JSON conversion successful!"

    # A zero exit status alone does not prove the output exists
    if [ ! -f "$json_file" ]; then
        echo "❌ JSON file was not created for $parquet_name"
        return 2
    fi

    echo "📊 Generating token statistics..."

    # Run the token analysis
    if ! python3 "$SCRIPT_DIR/analyze_tokens.py" "$json_file"; then
        echo "❌ Token analysis failed for $parquet_name"
        return 3
    fi
    echo "✅ Token analysis completed successfully!"

    # Stats file names are derived from the JSON name; this script does not
    # verify that analyze_tokens.py actually wrote them.
    echo "📄 Generated files:"
    echo "   - $(basename "${json_file%.json}_stats.txt")"
    echo "   - $(basename "${json_file%.json}_stats.json")"
    return 0
}

# Process each parquet file, tallying real outcomes so the summary does not
# claim output for files whose conversion or analysis failed. A failure on
# one file does not stop the remaining files from being processed.
json_created=0
stats_created=0
failed=0
for parquet_file in "${PARQUET_FILES[@]}"; do
    process_parquet_file "$parquet_file"
    case $? in
        0)
            json_created=$((json_created + 1))
            stats_created=$((stats_created + 1))
            ;;
        3)
            # JSON exists, but no stats were produced
            json_created=$((json_created + 1))
            failed=$((failed + 1))
            ;;
        *)
            failed=$((failed + 1))
            ;;
    esac

    echo ""
done

echo "🎉 Batch processing completed!"
echo ""
echo "📋 Summary of generated files:"
echo "   - JSON files: $json_created"
# Assumes each successful analysis yields one stats text file and one stats
# JSON file, matching the per-file "Generated files" message.
echo "   - Stats text files: $stats_created"
echo "   - Stats JSON files: $stats_created"
if [ "$failed" -gt 0 ]; then
    echo "   - Files with errors: $failed of ${#PARQUET_FILES[@]}"
fi
echo ""
echo "📁 All files are saved in: $INPUT_DIR"