#!/usr/bin/env python3
"""
Script to reformat high_level_codes.parquet to comply with comprehensive pipeline config.

This script converts the current high-level codes format to the expected format
with a 'tag' column that the pipeline can use.
"""

import pandas as pd
import os
import sys
import argparse
from typing import List, Dict, Any

def reformat_high_level_codes(input_path: str, output_path: str) -> None:
    """
    Reformat high-level codes parquet file to have the expected 'tag' column.

    The 'high_level_code' column of the input is written out as 'tag'. The
    'cluster_id', 'source_codes' and 'num_source_codes' columns are copied
    when present and otherwise filled with '', [] and 0 respectively. The
    output always has exactly these four columns, in that order, even when
    the input has no rows.

    Args:
        input_path: Path to the original high_level_codes.parquet file
        output_path: Path where the reformatted file should be saved

    Raises:
        KeyError: If the input file has no 'high_level_code' column.
    """
    print(f"📂 Reading high-level codes from: {input_path}")

    # Read the original parquet file
    df = pd.read_parquet(input_path)

    print("📊 Original structure:")
    print(f"   Shape: {df.shape}")
    print(f"   Columns: {list(df.columns)}")
    print("   Sample data:")
    print(df.head(2))

    # Fail early with an actionable message instead of a bare KeyError
    # raised from somewhere inside the conversion.
    if 'high_level_code' not in df.columns:
        raise KeyError(
            "Input file is missing required column 'high_level_code'; "
            f"found columns: {list(df.columns)}"
        )

    n_rows = len(df)

    # Builders for the optional columns, used only when the input lacks them.
    # Explicit dtypes keep the zero-row case well-defined.
    fallback_columns = {
        'cluster_id': lambda: pd.Series([''] * n_rows, dtype=object),
        # A fresh list per row so rows never share one mutable default.
        'source_codes': lambda: pd.Series([[] for _ in range(n_rows)], dtype=object),
        'num_source_codes': lambda: pd.Series([0] * n_rows, dtype='int64'),
    }

    # Build the output column-wise: this keeps the column set stable for empty
    # inputs and avoids the per-row overhead and dtype re-inference of iterrows().
    # reset_index(drop=True) gives every column the same 0..n-1 index so the
    # DataFrame constructor does not try to align on the input's index.
    columns = {'tag': df['high_level_code'].reset_index(drop=True)}
    for name, make_fallback in fallback_columns.items():
        if name in df.columns:
            columns[name] = df[name].reset_index(drop=True)
        else:
            columns[name] = make_fallback()

    reformatted_df = pd.DataFrame(columns)

    print("\n📊 Reformatted structure:")
    print(f"   Shape: {reformatted_df.shape}")
    print(f"   Columns: {list(reformatted_df.columns)}")
    print("   Sample data:")
    print(reformatted_df.head(2))

    # Save the reformatted file
    print(f"\n💾 Saving reformatted file to: {output_path}")
    reformatted_df.to_parquet(output_path, index=False)

    print(f"✅ Successfully reformatted {len(reformatted_df)} high-level codes")
    print(f"✅ File saved to: {output_path}")

def main():
    """Parse the command line, resolve input/output paths, and run the reformatting.

    Exits with status 1 when no usable input file is found or when the
    reformatting step raises.
    """
    cli = argparse.ArgumentParser(
        description="Reformat high-level codes parquet file for pipeline compatibility",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Reformat with custom input path, output in same directory
  python reformat_high_level_codes.py --input /path/to/high_level_codes.parquet
  
  # Reformat with custom input and output paths
  python reformat_high_level_codes.py --input /path/to/input.parquet --output /path/to/output.parquet
  
  # Reformat with default input path (current directory)
  python reformat_high_level_codes.py
        """
    )
    cli.add_argument(
        '--input', '-i',
        help='Path to input parquet file (default: high_level_codes.parquet in current directory)',
    )
    cli.add_argument(
        '--output', '-o',
        help='Path to output parquet file (default: high_level_codes_reformatted.parquet in same directory as input)',
    )
    options = cli.parse_args()

    # Resolve the input: an explicit --input wins, otherwise fall back to the
    # conventional file name in the current directory (which must exist).
    input_path = options.input
    if not input_path:
        fallback = "high_level_codes.parquet"
        if not os.path.exists(fallback):
            print(f"❌ Error: No input file specified and default file '{fallback}' not found in current directory")
            print("💡 Use --input to specify the path to your high_level_codes.parquet file")
            sys.exit(1)
        input_path = fallback
        print(f"📁 Using default input file: {input_path}")

    # Resolve the output: default is "<name>_reformatted<ext>" beside the input.
    output_path = options.output
    if not output_path:
        parent_dir = os.path.dirname(input_path) or "."
        stem, suffix = os.path.splitext(os.path.basename(input_path))
        output_path = os.path.join(parent_dir, f"{stem}_reformatted{suffix}")

    # Bail out before touching the filesystem if the input is missing.
    if not os.path.exists(input_path):
        print(f"❌ Error: Input file not found: {input_path}")
        sys.exit(1)

    # Make sure the destination directory exists (a bare file name has none).
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"📁 Created output directory: {output_dir}")

    # Top-level boundary: report any failure and signal it via the exit code.
    try:
        reformat_high_level_codes(input_path, output_path)
    except Exception as exc:
        print(f"❌ Error during reformatting: {exc}")
        sys.exit(1)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
