#!/usr/bin/env python3
"""
Manual Final Corpus Generator for Schema Induction Pipeline

This script allows you to generate the final corpus for any iteration
without rerunning the entire pipeline. It prioritizes topological sort
results to create a refined final corpus with only the 'tag' column.

Usage:
    python generate_final_corpus.py <iteration_number>
    
Example:
    python generate_final_corpus.py 1
    python generate_final_corpus.py 2
"""

import os
import sys
import pandas as pd
import json
from datetime import datetime

def generate_final_corpus(iteration_number: int) -> bool:
    """Generate the final corpus for one iteration of the pipeline.

    The corpus is a single-column ('tag') parquet file written to
    ``temp_files/iteration_<NN>/final_corpus/final_corpus.parquet`` together
    with a ``final_corpus_summary.json`` describing where the tags came from.

    Sources are tried in this order:

    1. ``topologically_sorted_graph/topological_sort.parquet`` - the unique
       values of its 'node' column become the tags.
    2. ``high_level_codes/enhanced_corpus.parquet`` - unique 'tag' values.
    3. ``build_corpus/corpus.parquet`` - unique 'tag' values.

    Only the first existing fallback file (2 or 3) is consulted; if it lacks
    a 'tag' column the function gives up rather than trying the next one.

    If the ``final_corpus`` directory already exists the user is asked on
    stdin whether to overwrite it; anything other than 'y' (including a
    closed stdin) skips generation.

    Args:
        iteration_number: Iteration whose directory should be processed.
            It is zero-padded to two digits to build the directory name.

    Returns:
        True if the corpus and summary were written, False if the iteration
        directory or a usable source is missing, or the overwrite was declined.

    Raises:
        Exception: Errors from reading a fallback parquet file or from
            writing the outputs are not caught here and propagate to the
            caller.
    """
    print(f'=== GENERATING FINAL CORPUS FOR ITERATION {iteration_number} ===')
    print()

    # Check if iteration directory exists
    iteration_dir = f'temp_files/iteration_{iteration_number:02d}'
    if not os.path.exists(iteration_dir):
        print(f'❌ Iteration directory not found: {iteration_dir}')
        return False

    print('🔍 CHECKING AVAILABLE CORPUS FILES:')

    # All candidate inputs are resolved up front so that every later branch
    # (including the summary) can refer to them unconditionally.
    topological_sort_path = os.path.join(iteration_dir, 'topologically_sorted_graph', 'topological_sort.parquet')
    enhanced_corpus_path = os.path.join(iteration_dir, 'high_level_codes', 'enhanced_corpus.parquet')
    build_corpus_path = os.path.join(iteration_dir, 'build_corpus', 'corpus.parquet')

    corpus_df = None
    corpus_source = None
    source_file = None  # path of the file the tags were actually taken from
    topological_sort_used = False

    # Step 1: Try to load topological sort results (preferred method)
    if os.path.exists(topological_sort_path):
        print('🌳 USING TOPOLOGICAL SORT RESULTS:')
        print(f'✅ Found topological sort: {topological_sort_path}')

        try:
            topological_sort_df = pd.read_parquet(topological_sort_path)
        except Exception as e:
            # Deliberately broad: any read failure just triggers the fallback.
            print(f'   ❌ Error loading topological sort: {e}')
        else:
            print(f'   - Loaded {len(topological_sort_df)} topological sort records')
            print(f'   - Columns: {list(topological_sort_df.columns)}')

            if 'node' in topological_sort_df.columns:
                unique_codes = topological_sort_df['node'].unique()
                print(f'   - Found {len(unique_codes)} unique refined codes')

                # Create simple corpus with only 'tag' column
                corpus_df = pd.DataFrame({'tag': unique_codes})
                corpus_source = 'topological_sort_refined_codes'
                source_file = topological_sort_path
                topological_sort_used = True

                print(f'   🎯 Created final corpus with {len(corpus_df)} refined codes from topological sort')
            else:
                print(f"   ❌ No 'node' column found in topological sort. Columns: {list(topological_sort_df.columns)}")
    else:
        print(f'⚠️  Topological sort not found: {topological_sort_path}')

    # Step 2: Fallback to original corpus files if topological sort failed
    if corpus_df is None:
        print('\n🔄 FALLBACK: USING ORIGINAL CORPUS FILES:')

        # Ordered by preference; only the first file that exists is used.
        fallback_candidates = (
            ('Enhanced corpus', enhanced_corpus_path, 'enhanced_corpus_fallback'),
            ('Build corpus', build_corpus_path, 'build_corpus_fallback'),
        )
        chosen = next(
            (candidate for candidate in fallback_candidates if os.path.exists(candidate[1])),
            None,
        )

        if chosen is None:
            print(f'❌ No corpus files found in {iteration_dir}')
            print('   Expected files:')
            print(f'   - {topological_sort_path}')
            print(f'   - {enhanced_corpus_path}')
            print(f'   - {build_corpus_path}')
            return False

        label, source_file, corpus_source = chosen
        print(f'✅ {label}: {source_file}')
        original_corpus = pd.read_parquet(source_file)
        print(f'   - Rows: {len(original_corpus)}')
        print(f'   - Columns: {list(original_corpus.columns)}')

        # Extract only unique tags for fallback
        if 'tag' not in original_corpus.columns:
            print(f"   ❌ No 'tag' column found in corpus. Columns: {list(original_corpus.columns)}")
            return False

        corpus_df = pd.DataFrame({'tag': original_corpus['tag'].unique()})
        print(f'   🎯 Extracted {len(corpus_df)} unique tags from fallback corpus')

    print()

    # Check if final_corpus already exists
    final_corpus_dir = os.path.join(iteration_dir, 'final_corpus')
    if os.path.exists(final_corpus_dir):
        print(f'⚠️  Final corpus directory already exists: {final_corpus_dir}')
        try:
            response = input('   Do you want to overwrite it? (y/N): ').strip().lower()
        except EOFError:
            # No interactive stdin (e.g. cron/CI): take the safe default "N".
            print()
            response = ''
        if response != 'y':
            print('   Skipping final corpus generation.')
            return False

    # Create final_corpus directory
    os.makedirs(final_corpus_dir, exist_ok=True)
    print(f'📁 Created final_corpus directory: {final_corpus_dir}')

    # Save final corpus (only 'tag' column)
    final_corpus_path = os.path.join(final_corpus_dir, 'final_corpus.parquet')
    corpus_df.to_parquet(final_corpus_path, index=False)
    print(f'💾 Saved final corpus: {final_corpus_path}')

    # Create summary
    summary = {
        'total_codes': len(corpus_df),
        # int() guards against a numpy integer, which json cannot serialise.
        'unique_codes': int(corpus_df['tag'].nunique()),
        'iteration_dir': iteration_dir,
        'corpus_source': corpus_source,
        'topological_sort_used': topological_sort_used,
        'timestamp': datetime.now().isoformat(),
        'columns': list(corpus_df.columns),
        'refinement_info': {
            'used_refined_codes': topological_sort_used,
            'source_file': source_file,
        },
    }

    summary_path = os.path.join(final_corpus_dir, 'final_corpus_summary.json')
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f'📊 Created summary: {summary_path}')
    print()

    print('✅ FINAL CORPUS GENERATION COMPLETE!')
    print(f'   📁 Directory: {final_corpus_dir}')
    print(f'   📄 Corpus file: {final_corpus_path}')
    print(f'   📊 Total codes: {summary["total_codes"]}')
    print(f'   🔢 Unique codes: {summary["unique_codes"]}')
    print(f'   📋 Source: {corpus_source}')
    print(f'   🌳 Used topological sort: {topological_sort_used}')
    print(f'   📋 Columns: {summary["columns"]}')

    return True

def main():
    """Command-line entry point: parse the iteration number and run generation.

    Expects exactly one argument, the iteration number. The process always
    terminates through ``sys.exit``: status 0 when ``generate_final_corpus``
    returns a truthy value, status 1 for a usage error, a non-integer
    argument, a falsy result, or any exception raised during generation.
    """
    if len(sys.argv) != 2:
        print("Usage: python generate_final_corpus.py <iteration_number>")
        print("Example: python generate_final_corpus.py 1")
        sys.exit(1)

    raw_iteration = sys.argv[1]

    # Parse on its own so that a ValueError raised later, inside generation,
    # is not misreported as a bad command-line argument.
    try:
        iteration_number = int(raw_iteration)
    except ValueError:
        print(f"Error: '{raw_iteration}' is not a valid iteration number")
        sys.exit(1)

    # Top-level boundary: report any failure and exit non-zero.
    try:
        success = generate_final_corpus(iteration_number)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

    sys.exit(0 if success else 1)

# Run the CLI only when this file is executed as a script, so that importing
# the module (e.g. to call generate_final_corpus directly) has no side effects.
if __name__ == "__main__":
    main()
