#!/usr/bin/env python3
"""
Comprehensive script to reformat high_level_codes.parquet to comply with pipeline config.

This script provides multiple reformatting options and can handle various input formats.
"""

import pandas as pd
import os
import sys
import argparse
from typing import List, Dict, Any, Optional

class HighLevelCodesReformatter:
    """Load a high-level codes parquet file and rewrite it with a 'tag' column.

    The input is expected to hold its values in a 'high_level_code' column.
    Three output layouts are available (basic, minimal, expanded); each one
    writes a parquet file and re-reads it to confirm that 'tag' is present.
    """

    # Sample 'tag' values longer than this are truncated in the summary.
    _PREVIEW_WIDTH = 80

    def __init__(self, input_path: str):
        """
        Initialize the reformatter and load the input file immediately.

        Args:
            input_path: Path to the input parquet file

        Raises:
            FileNotFoundError: If input_path does not exist
        """
        self.input_path = input_path
        self.df: Optional[pd.DataFrame] = None
        self._load_data()

    def _load_data(self) -> None:
        """Read the parquet file at self.input_path into self.df.

        Raises:
            FileNotFoundError: If self.input_path does not exist
        """
        if not os.path.exists(self.input_path):
            raise FileNotFoundError(f"Input file not found: {self.input_path}")

        print(f"📂 Loading data from: {self.input_path}")
        self.df = pd.read_parquet(self.input_path)
        print(f"✅ Loaded {len(self.df)} records")

    def analyze_structure(self) -> List[str]:
        """Print shape, columns, dtypes and a three-row sample of the data.

        Also prints a warning naming any of 'tag', 'high_level_code' and
        'cluster_id' that are absent.

        Returns:
            The column names of the loaded data
        """
        print("\n📊 Data Structure Analysis:")
        print(f"   Shape: {self.df.shape}")
        print(f"   Columns: {list(self.df.columns)}")
        print("   Data types:")
        for col, dtype in self.df.dtypes.items():
            print(f"     {col}: {dtype}")

        print("\n📋 Sample data (first 3 rows):")
        print(self.df.head(3))

        # Check for expected columns
        expected_columns = ['tag', 'high_level_code', 'cluster_id']
        missing_columns = [col for col in expected_columns if col not in self.df.columns]
        if missing_columns:
            print(f"\n⚠️  Missing expected columns: {missing_columns}")

        return self.df.columns.tolist()

    def _require_source_column(self) -> None:
        """Raise ValueError unless the data has a 'high_level_code' column."""
        if 'high_level_code' not in self.df.columns:
            raise ValueError("Column 'high_level_code' not found in the data")

    @staticmethod
    def _tag_first(df: pd.DataFrame) -> pd.DataFrame:
        """Return df with 'tag' moved to the first column position."""
        ordered = ['tag'] + [col for col in df.columns if col != 'tag']
        return df[ordered]

    @classmethod
    def _tag_preview(cls, tag: Any) -> str:
        """Return tag as text, truncated with '...' past _PREVIEW_WIDTH chars.

        The value is converted with str() first so that missing or
        non-string tags (None, NaN, numbers) do not raise.
        """
        text = str(tag)
        suffix = '...' if len(text) > cls._PREVIEW_WIDTH else ''
        return f"{text[:cls._PREVIEW_WIDTH]}{suffix}"

    def _build_basic(self) -> pd.DataFrame:
        """Build the basic layout: all columns, 'high_level_code' renamed to 'tag'."""
        self._require_source_column()
        frame = self.df.copy()
        if 'tag' in frame.columns:
            # Renaming onto an existing 'tag' would leave two columns with the
            # same name, which cannot be written to parquet. Replace the old
            # column, as the expanded layout does when it assigns 'tag'.
            print("⚠️  Existing 'tag' column will be replaced by 'high_level_code'")
            frame = frame.drop(columns='tag')
        frame = frame.rename(columns={'high_level_code': 'tag'})
        return self._tag_first(frame)

    def _build_minimal(self) -> pd.DataFrame:
        """Build the minimal layout: 'tag' plus 'cluster_id' when available."""
        self._require_source_column()
        essential_columns = ['high_level_code']
        if 'cluster_id' in self.df.columns:
            essential_columns.append('cluster_id')
        frame = self.df[essential_columns].copy()
        return frame.rename(columns={'high_level_code': 'tag'})

    def _build_expanded(self) -> pd.DataFrame:
        """Build the expanded layout: all columns plus 'tag' copied from 'high_level_code'."""
        self._require_source_column()
        frame = self.df.copy()
        frame['tag'] = frame['high_level_code']
        return self._tag_first(frame)

    def reformat_basic(self, output_path: str) -> pd.DataFrame:
        """
        Basic reformatting: rename 'high_level_code' to 'tag'.

        A pre-existing 'tag' column is replaced (a warning is printed).

        Args:
            output_path: Path to save the reformatted file

        Returns:
            Reformatted DataFrame

        Raises:
            ValueError: If 'high_level_code' is missing from the data
        """
        print("\n🔄 Performing basic reformatting...")
        reformatted_df = self._build_basic()
        self._save_and_validate(reformatted_df, output_path)
        return reformatted_df

    def reformat_minimal(self, output_path: str) -> pd.DataFrame:
        """
        Minimal reformatting: keep only essential columns with 'tag'.

        Args:
            output_path: Path to save the reformatted file

        Returns:
            Reformatted DataFrame

        Raises:
            ValueError: If 'high_level_code' is missing from the data
        """
        print("\n🔄 Performing minimal reformatting...")
        reformatted_df = self._build_minimal()
        self._save_and_validate(reformatted_df, output_path)
        return reformatted_df

    def reformat_expanded(self, output_path: str) -> pd.DataFrame:
        """
        Expanded reformatting: include all original columns plus 'tag'.

        Args:
            output_path: Path to save the reformatted file

        Returns:
            Reformatted DataFrame

        Raises:
            ValueError: If 'high_level_code' is missing from the data
        """
        print("\n🔄 Performing expanded reformatting...")
        reformatted_df = self._build_expanded()
        self._save_and_validate(reformatted_df, output_path)
        return reformatted_df

    def _save_and_validate(self, df: pd.DataFrame, output_path: str) -> None:
        """Write df to output_path as parquet, re-read it, and print a summary.

        Args:
            df: DataFrame to write; must contain a 'tag' column
            output_path: Destination parquet path; its parent directory is
                created when missing

        Raises:
            ValueError: If the re-read file has no 'tag' column
        """
        # Create output directory if needed
        output_dir = os.path.dirname(output_path)
        if output_dir and not os.path.exists(output_dir):
            # exist_ok guards against the directory appearing after the check.
            os.makedirs(output_dir, exist_ok=True)
            print(f"📁 Created output directory: {output_dir}")

        # Save the file
        print(f"💾 Saving reformatted file to: {output_path}")
        df.to_parquet(output_path, index=False)

        # Validate the saved file
        print("🔍 Validating saved file...")
        validation_df = pd.read_parquet(output_path)

        if 'tag' not in validation_df.columns:
            raise ValueError("Validation failed: 'tag' column not found in saved file")

        print(f"✅ Successfully reformatted {len(df)} high-level codes")
        print(f"✅ File saved and validated: {output_path}")
        print("📊 Final structure:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {list(df.columns)}")
        print("   Sample 'tag' values:")
        for position, tag in enumerate(df['tag'].head(3), start=1):
            print(f"     {position}. {self._tag_preview(tag)}")

def main(argv: Optional[List[str]] = None) -> None:
    """Command line entry point: analyze a parquet file and optionally reformat it.

    Args:
        argv: Argument list to parse instead of the process arguments. When
            None (the default), argparse reads sys.argv[1:].

    The process exits with status 1 when no input file can be determined or
    when any exception is raised while loading, analyzing or reformatting.
    """
    parser = argparse.ArgumentParser(
        description="Reformat high-level codes parquet file for pipeline compatibility",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze structure only
  python comprehensive_reformat_script.py --input /path/to/high_level_codes.parquet --analyze-only
  
  # Basic reformatting with custom input path
  python comprehensive_reformat_script.py --input /path/to/high_level_codes.parquet --mode basic
  
  # Minimal reformatting with custom input and output paths
  python comprehensive_reformat_script.py --input /path/to/input.parquet --output /path/to/output.parquet --mode minimal
  
  # Expanded reformatting with default input (current directory)
  python comprehensive_reformat_script.py --mode expanded
  
  # Use default input file in current directory
  python comprehensive_reformat_script.py
        """
    )

    parser.add_argument('--input', '-i',
                        help='Path to input parquet file (default: high_level_codes.parquet in current directory)')
    parser.add_argument('--output', '-o',
                        help='Path to output parquet file (default: auto-generated based on input path and mode)')
    parser.add_argument('--mode', choices=['basic', 'minimal', 'expanded'],
                        default='basic', help='Reformatting mode (default: basic)')
    parser.add_argument('--analyze-only', action='store_true',
                        help='Only analyze the structure, do not reformat')

    args = parser.parse_args(argv)

    try:
        # Fall back to a file in the current directory when --input is absent.
        if args.input:
            input_path = args.input
        else:
            default_input = "high_level_codes.parquet"
            if not os.path.exists(default_input):
                print(f"❌ Error: No input file specified and default file '{default_input}' not found in current directory")
                print("💡 Use --input to specify the path to your high_level_codes.parquet file")
                # SystemExit is not an Exception subclass, so the handler
                # below does not intercept it.
                sys.exit(1)
            input_path = default_input
            print(f"📁 Using default input file: {input_path}")

        # Initialize reformatter (loads the file) and print its structure
        reformatter = HighLevelCodesReformatter(input_path)
        reformatter.analyze_structure()

        if args.analyze_only:
            print("\n✅ Analysis complete. No reformatting performed.")
            return

        # Default output: "<name>_<mode><ext>" next to the input file.
        if args.output:
            output_path = args.output
        else:
            input_dir = os.path.dirname(input_path) or "."
            name, ext = os.path.splitext(os.path.basename(input_path))
            output_path = os.path.join(input_dir, f"{name}_{args.mode}{ext}")

        # argparse restricts --mode to exactly these keys.
        handlers = {
            'basic': reformatter.reformat_basic,
            'minimal': reformatter.reformat_minimal,
            'expanded': reformatter.reformat_expanded,
        }
        handlers[args.mode](output_path)

        print("\n🎉 Reformatting completed successfully!")

    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit status.
        print(f"❌ Error: {e}")
        sys.exit(1)

# Run the command line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
